Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton: "Almost all of the rest of MM. There was an unusually large amount of MM material this time" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits) zpool: remove no-op module init/exit mm: zbud: constify the zbud_ops mm: zpool: constify the zpool_ops mm: swap: zswap: maybe_preload & refactoring zram: unify error reporting zsmalloc: remove null check from destroy_handle_cache() zsmalloc: do not take class lock in zs_shrinker_count() zsmalloc: use class->pages_per_zspage zsmalloc: consider ZS_ALMOST_FULL as migrate source zsmalloc: partial page ordering within a fullness_list zsmalloc: use shrinker to trigger auto-compaction zsmalloc: account the number of compacted pages zsmalloc/zram: introduce zs_pool_stats api zsmalloc: cosmetic compaction code adjustments zsmalloc: introduce zs_can_compact() function zsmalloc: always keep per-class stats zsmalloc: drop unused variable `nr_to_migrate' mm/memblock.c: fix comment in __next_mem_range() mm/page_alloc.c: fix type information of memoryless node memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node() ...hifive-unleashed-5.1
commit
f6f7a63692
|
@ -104,6 +104,13 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
|
||||||
from this pool must not cross 4KByte boundaries.
|
from this pool must not cross 4KByte boundaries.
|
||||||
|
|
||||||
|
|
||||||
|
void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||||
|
dma_addr_t *handle)
|
||||||
|
|
||||||
|
Wraps dma_pool_alloc() and also zeroes the returned memory if the
|
||||||
|
allocation attempt succeeded.
|
||||||
|
|
||||||
|
|
||||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
|
||||||
dma_addr_t *dma_handle);
|
dma_addr_t *dma_handle);
|
||||||
|
|
||||||
|
|
|
@ -144,7 +144,8 @@ mem_used_max RW the maximum amount memory zram have consumed to
|
||||||
store compressed data
|
store compressed data
|
||||||
mem_limit RW the maximum amount of memory ZRAM can use to store
|
mem_limit RW the maximum amount of memory ZRAM can use to store
|
||||||
the compressed data
|
the compressed data
|
||||||
num_migrated RO the number of objects migrated migrated by compaction
|
pages_compacted RO the number of pages freed during compaction
|
||||||
|
(available only via zram<id>/mm_stat node)
|
||||||
compact WO trigger memory compaction
|
compact WO trigger memory compaction
|
||||||
|
|
||||||
WARNING
|
WARNING
|
||||||
|
|
|
@ -60,9 +60,10 @@ Filesystem support consists of
|
||||||
- implementing the direct_IO address space operation, and calling
|
- implementing the direct_IO address space operation, and calling
|
||||||
dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
|
dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
|
||||||
- implementing an mmap file operation for DAX files which sets the
|
- implementing an mmap file operation for DAX files which sets the
|
||||||
VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
|
VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
|
||||||
for fault and page_mkwrite (which should probably call dax_fault() and
|
include handlers for fault, pmd_fault and page_mkwrite (which should
|
||||||
dax_mkwrite(), passing the appropriate get_block() callback)
|
probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
|
||||||
|
appropriate get_block() callback)
|
||||||
- calling dax_truncate_page() instead of block_truncate_page() for DAX files
|
- calling dax_truncate_page() instead of block_truncate_page() for DAX files
|
||||||
- calling dax_zero_page_range() instead of zero_user() for DAX files
|
- calling dax_zero_page_range() instead of zero_user() for DAX files
|
||||||
- ensuring that there is sufficient locking between reads, writes,
|
- ensuring that there is sufficient locking between reads, writes,
|
||||||
|
|
|
@ -424,6 +424,7 @@ Private_Dirty: 0 kB
|
||||||
Referenced: 892 kB
|
Referenced: 892 kB
|
||||||
Anonymous: 0 kB
|
Anonymous: 0 kB
|
||||||
Swap: 0 kB
|
Swap: 0 kB
|
||||||
|
SwapPss: 0 kB
|
||||||
KernelPageSize: 4 kB
|
KernelPageSize: 4 kB
|
||||||
MMUPageSize: 4 kB
|
MMUPageSize: 4 kB
|
||||||
Locked: 374 kB
|
Locked: 374 kB
|
||||||
|
@ -433,16 +434,23 @@ the first of these lines shows the same information as is displayed for the
|
||||||
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
|
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
|
||||||
(size), the amount of the mapping that is currently resident in RAM (RSS), the
|
(size), the amount of the mapping that is currently resident in RAM (RSS), the
|
||||||
process' proportional share of this mapping (PSS), the number of clean and
|
process' proportional share of this mapping (PSS), the number of clean and
|
||||||
dirty private pages in the mapping. Note that even a page which is part of a
|
dirty private pages in the mapping.
|
||||||
MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used
|
|
||||||
by only one process, is accounted as private and not as shared. "Referenced"
|
The "proportional set size" (PSS) of a process is the count of pages it has
|
||||||
indicates the amount of memory currently marked as referenced or accessed.
|
in memory, where each page is divided by the number of processes sharing it.
|
||||||
|
So if a process has 1000 pages all to itself, and 1000 shared with one other
|
||||||
|
process, its PSS will be 1500.
|
||||||
|
Note that even a page which is part of a MAP_SHARED mapping, but has only
|
||||||
|
a single pte mapped, i.e. is currently used by only one process, is accounted
|
||||||
|
as private and not as shared.
|
||||||
|
"Referenced" indicates the amount of memory currently marked as referenced or
|
||||||
|
accessed.
|
||||||
"Anonymous" shows the amount of memory that does not belong to any file. Even
|
"Anonymous" shows the amount of memory that does not belong to any file. Even
|
||||||
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
|
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
|
||||||
and a page is modified, the file page is replaced by a private anonymous copy.
|
and a page is modified, the file page is replaced by a private anonymous copy.
|
||||||
"Swap" shows how much would-be-anonymous memory is also used, but out on
|
"Swap" shows how much would-be-anonymous memory is also used, but out on
|
||||||
swap.
|
swap.
|
||||||
|
"SwapPss" shows proportional swap share of this mapping.
|
||||||
"VmFlags" field deserves a separate description. This member represents the kernel
|
"VmFlags" field deserves a separate description. This member represents the kernel
|
||||||
flags associated with the particular virtual memory area in two letter encoded
|
flags associated with the particular virtual memory area in two letter encoded
|
||||||
manner. The codes are the following:
|
manner. The codes are the following:
|
||||||
|
|
|
@ -349,7 +349,7 @@ zone[i]'s protection[j] is calculated by following expression.
|
||||||
|
|
||||||
(i < j):
|
(i < j):
|
||||||
zone[i]->protection[j]
|
zone[i]->protection[j]
|
||||||
= (total sums of present_pages from zone[i+1] to zone[j] on the node)
|
= (total sums of managed_pages from zone[i+1] to zone[j] on the node)
|
||||||
/ lowmem_reserve_ratio[i];
|
/ lowmem_reserve_ratio[i];
|
||||||
(i = j):
|
(i = j):
|
||||||
(should not be protected. = 0;
|
(should not be protected. = 0;
|
||||||
|
@ -360,7 +360,7 @@ The default values of lowmem_reserve_ratio[i] are
|
||||||
256 (if zone[i] means DMA or DMA32 zone)
|
256 (if zone[i] means DMA or DMA32 zone)
|
||||||
32 (others).
|
32 (others).
|
||||||
As above expression, they are reciprocal number of ratio.
|
As above expression, they are reciprocal number of ratio.
|
||||||
256 means 1/256. # of protection pages becomes about "0.39%" of total present
|
256 means 1/256. # of protection pages becomes about "0.39%" of total managed
|
||||||
pages of higher zones on the node.
|
pages of higher zones on the node.
|
||||||
|
|
||||||
If you would like to protect more pages, smaller values are effective.
|
If you would like to protect more pages, smaller values are effective.
|
||||||
|
|
|
@ -75,7 +75,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
|
||||||
|
|
||||||
'e' - Send a SIGTERM to all processes, except for init.
|
'e' - Send a SIGTERM to all processes, except for init.
|
||||||
|
|
||||||
'f' - Will call oom_kill to kill a memory hog process.
|
'f' - Will call the oom killer to kill a memory hog process, but will not
|
||||||
|
panic if nothing can be killed.
|
||||||
|
|
||||||
'g' - Used by kgdb (kernel debugger)
|
'g' - Used by kgdb (kernel debugger)
|
||||||
|
|
||||||
|
|
|
@ -329,7 +329,14 @@ Examples
|
||||||
|
|
||||||
3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
|
3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
|
||||||
|
|
||||||
4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a
|
4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
|
||||||
wide range of userspace tools to help with huge page usability, environment
|
provides a wide range of userspace tools to help with huge page usability,
|
||||||
setup, and control. Furthermore it provides useful test cases that should be
|
environment setup, and control.
|
||||||
used when modifying code to ensure no regressions are introduced.
|
|
||||||
|
Kernel development regression testing
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
The most complete set of hugetlb tests is in the libhugetlbfs repository.
|
||||||
|
If you modify any hugetlb related code, use the libhugetlbfs test suite
|
||||||
|
to check for regressions. In addition, if you add any new hugetlb
|
||||||
|
functionality, please add appropriate tests to libhugetlbfs.
|
||||||
|
|
|
@ -16,11 +16,17 @@ There are three components to pagemap:
|
||||||
* Bits 0-4 swap type if swapped
|
* Bits 0-4 swap type if swapped
|
||||||
* Bits 5-54 swap offset if swapped
|
* Bits 5-54 swap offset if swapped
|
||||||
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
|
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
|
||||||
* Bits 56-60 zero
|
* Bit 56 page exclusively mapped (since 4.2)
|
||||||
* Bit 61 page is file-page or shared-anon
|
* Bits 57-60 zero
|
||||||
|
* Bit 61 page is file-page or shared-anon (since 3.5)
|
||||||
* Bit 62 page swapped
|
* Bit 62 page swapped
|
||||||
* Bit 63 page present
|
* Bit 63 page present
|
||||||
|
|
||||||
|
Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
|
||||||
|
In 4.0 and 4.1 opens by unprivileged users fail with -EPERM. Starting from
|
||||||
|
4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
|
||||||
|
Reason: information about PFNs helps in exploiting the Rowhammer vulnerability.
|
||||||
|
|
||||||
If the page is not present but in swap, then the PFN contains an
|
If the page is not present but in swap, then the PFN contains an
|
||||||
encoding of the swap file number and the page's offset into the
|
encoding of the swap file number and the page's offset into the
|
||||||
swap. Unmapped pages return a null PFN. This allows determining
|
swap. Unmapped pages return a null PFN. This allows determining
|
||||||
|
@ -159,3 +165,8 @@ Other notes:
|
||||||
Reading from any of the files will return -EINVAL if you are not starting
|
Reading from any of the files will return -EINVAL if you are not starting
|
||||||
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
|
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
|
||||||
into the file), or if the size of the read is not a multiple of 8 bytes.
|
into the file), or if the size of the read is not a multiple of 8 bytes.
|
||||||
|
|
||||||
|
Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
|
||||||
|
always 12 on most architectures). Since Linux 3.11 their meaning changes
|
||||||
|
after first clear of soft-dirty bits. Since Linux 4.2 they are used for
|
||||||
|
flags unconditionally.
|
||||||
|
|
|
@ -339,6 +339,67 @@ static void __init request_standard_resources(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_BLK_DEV_INITRD
|
||||||
|
/*
|
||||||
|
* Relocate initrd if it is not completely within the linear mapping.
|
||||||
|
* This would be the case if mem= cuts out all or part of it.
|
||||||
|
*/
|
||||||
|
static void __init relocate_initrd(void)
|
||||||
|
{
|
||||||
|
phys_addr_t orig_start = __virt_to_phys(initrd_start);
|
||||||
|
phys_addr_t orig_end = __virt_to_phys(initrd_end);
|
||||||
|
phys_addr_t ram_end = memblock_end_of_DRAM();
|
||||||
|
phys_addr_t new_start;
|
||||||
|
unsigned long size, to_free = 0;
|
||||||
|
void *dest;
|
||||||
|
|
||||||
|
if (orig_end <= ram_end)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Any of the original initrd which overlaps the linear map should
|
||||||
|
* be freed after relocating.
|
||||||
|
*/
|
||||||
|
if (orig_start < ram_end)
|
||||||
|
to_free = ram_end - orig_start;
|
||||||
|
|
||||||
|
size = orig_end - orig_start;
|
||||||
|
|
||||||
|
/* initrd needs to be relocated completely inside linear mapping */
|
||||||
|
new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn),
|
||||||
|
size, PAGE_SIZE);
|
||||||
|
if (!new_start)
|
||||||
|
panic("Cannot relocate initrd of size %ld\n", size);
|
||||||
|
memblock_reserve(new_start, size);
|
||||||
|
|
||||||
|
initrd_start = __phys_to_virt(new_start);
|
||||||
|
initrd_end = initrd_start + size;
|
||||||
|
|
||||||
|
pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n",
|
||||||
|
orig_start, orig_start + size - 1,
|
||||||
|
new_start, new_start + size - 1);
|
||||||
|
|
||||||
|
dest = (void *)initrd_start;
|
||||||
|
|
||||||
|
if (to_free) {
|
||||||
|
memcpy(dest, (void *)__phys_to_virt(orig_start), to_free);
|
||||||
|
dest += to_free;
|
||||||
|
}
|
||||||
|
|
||||||
|
copy_from_early_mem(dest, orig_start + to_free, size - to_free);
|
||||||
|
|
||||||
|
if (to_free) {
|
||||||
|
pr_info("Freeing original RAMDISK from [%llx-%llx]\n",
|
||||||
|
orig_start, orig_start + to_free - 1);
|
||||||
|
memblock_free(orig_start, to_free);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static inline void __init relocate_initrd(void)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
|
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
|
||||||
|
|
||||||
void __init setup_arch(char **cmdline_p)
|
void __init setup_arch(char **cmdline_p)
|
||||||
|
@ -372,6 +433,7 @@ void __init setup_arch(char **cmdline_p)
|
||||||
acpi_boot_table_init();
|
acpi_boot_table_init();
|
||||||
|
|
||||||
paging_init();
|
paging_init();
|
||||||
|
relocate_initrd();
|
||||||
request_standard_resources();
|
request_standard_resources();
|
||||||
|
|
||||||
early_ioremap_reset();
|
early_ioremap_reset();
|
||||||
|
|
|
@ -1140,13 +1140,9 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
{
|
{
|
||||||
int node = ioc->node;
|
|
||||||
struct page *page;
|
struct page *page;
|
||||||
|
|
||||||
if (node == NUMA_NO_NODE)
|
page = alloc_pages_node(ioc->node, flags, get_order(size));
|
||||||
node = numa_node_id();
|
|
||||||
|
|
||||||
page = alloc_pages_exact_node(node, flags, get_order(size));
|
|
||||||
if (unlikely(!page))
|
if (unlikely(!page))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
|
|
@ -97,7 +97,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid)
|
||||||
|
|
||||||
/* attempt to allocate a granule's worth of cached memory pages */
|
/* attempt to allocate a granule's worth of cached memory pages */
|
||||||
|
|
||||||
page = alloc_pages_exact_node(nid,
|
page = __alloc_pages_node(nid,
|
||||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||||
IA64_GRANULE_SHIFT-PAGE_SHIFT);
|
IA64_GRANULE_SHIFT-PAGE_SHIFT);
|
||||||
if (!page) {
|
if (!page) {
|
||||||
|
|
|
@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size,
|
||||||
*/
|
*/
|
||||||
node = pcibus_to_node(pdev->bus);
|
node = pcibus_to_node(pdev->bus);
|
||||||
if (likely(node >=0)) {
|
if (likely(node >=0)) {
|
||||||
struct page *p = alloc_pages_exact_node(node,
|
struct page *p = __alloc_pages_node(node,
|
||||||
flags, get_order(size));
|
flags, get_order(size));
|
||||||
|
|
||||||
if (likely(p))
|
if (likely(p))
|
||||||
|
|
|
@ -123,7 +123,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order)
|
||||||
|
|
||||||
area->nid = nid;
|
area->nid = nid;
|
||||||
area->order = order;
|
area->order = order;
|
||||||
area->pages = alloc_pages_exact_node(area->nid,
|
area->pages = __alloc_pages_node(area->nid,
|
||||||
GFP_KERNEL|__GFP_THISNODE,
|
GFP_KERNEL|__GFP_THISNODE,
|
||||||
area->order);
|
area->order);
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
#include <asm-generic/4level-fixup.h>
|
#include <asm-generic/4level-fixup.h>
|
||||||
|
|
||||||
#include <linux/spinlock.h>
|
#include <linux/spinlock.h>
|
||||||
#include <linux/swap.h>
|
#include <linux/mm_types.h>
|
||||||
#include <asm/types.h>
|
#include <asm/types.h>
|
||||||
#include <asm/pgtsrmmu.h>
|
#include <asm/pgtsrmmu.h>
|
||||||
#include <asm/vaddrs.h>
|
#include <asm/vaddrs.h>
|
||||||
|
|
|
@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
|
||||||
return ramdisk_size;
|
return ramdisk_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
|
|
||||||
static void __init relocate_initrd(void)
|
static void __init relocate_initrd(void)
|
||||||
{
|
{
|
||||||
/* Assume only end is not page aligned */
|
/* Assume only end is not page aligned */
|
||||||
u64 ramdisk_image = get_ramdisk_image();
|
u64 ramdisk_image = get_ramdisk_image();
|
||||||
u64 ramdisk_size = get_ramdisk_size();
|
u64 ramdisk_size = get_ramdisk_size();
|
||||||
u64 area_size = PAGE_ALIGN(ramdisk_size);
|
u64 area_size = PAGE_ALIGN(ramdisk_size);
|
||||||
unsigned long slop, clen, mapaddr;
|
|
||||||
char *p, *q;
|
|
||||||
|
|
||||||
/* We need to move the initrd down into directly mapped mem */
|
/* We need to move the initrd down into directly mapped mem */
|
||||||
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
|
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
|
||||||
|
@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
|
||||||
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
|
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
|
||||||
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
|
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
|
||||||
|
|
||||||
q = (char *)initrd_start;
|
copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
|
||||||
|
|
||||||
/* Copy the initrd */
|
|
||||||
while (ramdisk_size) {
|
|
||||||
slop = ramdisk_image & ~PAGE_MASK;
|
|
||||||
clen = ramdisk_size;
|
|
||||||
if (clen > MAX_MAP_CHUNK-slop)
|
|
||||||
clen = MAX_MAP_CHUNK-slop;
|
|
||||||
mapaddr = ramdisk_image & PAGE_MASK;
|
|
||||||
p = early_memremap(mapaddr, clen+slop);
|
|
||||||
memcpy(q, p+slop, clen);
|
|
||||||
early_memunmap(p, clen+slop);
|
|
||||||
q += clen;
|
|
||||||
ramdisk_image += clen;
|
|
||||||
ramdisk_size -= clen;
|
|
||||||
}
|
|
||||||
|
|
||||||
ramdisk_image = get_ramdisk_image();
|
|
||||||
ramdisk_size = get_ramdisk_size();
|
|
||||||
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
|
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
|
||||||
" [mem %#010llx-%#010llx]\n",
|
" [mem %#010llx-%#010llx]\n",
|
||||||
ramdisk_image, ramdisk_image + ramdisk_size - 1,
|
ramdisk_image, ramdisk_image + ramdisk_size - 1,
|
||||||
|
|
|
@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
|
||||||
struct page *pages;
|
struct page *pages;
|
||||||
struct vmcs *vmcs;
|
struct vmcs *vmcs;
|
||||||
|
|
||||||
pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
|
pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
|
||||||
if (!pages)
|
if (!pages)
|
||||||
return NULL;
|
return NULL;
|
||||||
vmcs = page_address(pages);
|
vmcs = page_address(pages);
|
||||||
|
|
|
@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
||||||
bi->start = max(bi->start, low);
|
bi->start = max(bi->start, low);
|
||||||
bi->end = min(bi->end, high);
|
bi->end = min(bi->end, high);
|
||||||
|
|
||||||
/* and there's no empty block */
|
/* and there's no empty or non-exist block */
|
||||||
if (bi->start >= bi->end)
|
if (bi->start >= bi->end ||
|
||||||
|
!memblock_overlaps_region(&memblock.memory,
|
||||||
|
bi->start, bi->end - bi->start))
|
||||||
numa_remove_memblk_from(i--, mi);
|
numa_remove_memblk_from(i--, mi);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -388,7 +388,6 @@ static ssize_t comp_algorithm_store(struct device *dev,
|
||||||
static ssize_t compact_store(struct device *dev,
|
static ssize_t compact_store(struct device *dev,
|
||||||
struct device_attribute *attr, const char *buf, size_t len)
|
struct device_attribute *attr, const char *buf, size_t len)
|
||||||
{
|
{
|
||||||
unsigned long nr_migrated;
|
|
||||||
struct zram *zram = dev_to_zram(dev);
|
struct zram *zram = dev_to_zram(dev);
|
||||||
struct zram_meta *meta;
|
struct zram_meta *meta;
|
||||||
|
|
||||||
|
@ -399,8 +398,7 @@ static ssize_t compact_store(struct device *dev,
|
||||||
}
|
}
|
||||||
|
|
||||||
meta = zram->meta;
|
meta = zram->meta;
|
||||||
nr_migrated = zs_compact(meta->mem_pool);
|
zs_compact(meta->mem_pool);
|
||||||
atomic64_add(nr_migrated, &zram->stats.num_migrated);
|
|
||||||
up_read(&zram->init_lock);
|
up_read(&zram->init_lock);
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
|
@ -428,26 +426,31 @@ static ssize_t mm_stat_show(struct device *dev,
|
||||||
struct device_attribute *attr, char *buf)
|
struct device_attribute *attr, char *buf)
|
||||||
{
|
{
|
||||||
struct zram *zram = dev_to_zram(dev);
|
struct zram *zram = dev_to_zram(dev);
|
||||||
|
struct zs_pool_stats pool_stats;
|
||||||
u64 orig_size, mem_used = 0;
|
u64 orig_size, mem_used = 0;
|
||||||
long max_used;
|
long max_used;
|
||||||
ssize_t ret;
|
ssize_t ret;
|
||||||
|
|
||||||
|
memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
|
||||||
|
|
||||||
down_read(&zram->init_lock);
|
down_read(&zram->init_lock);
|
||||||
if (init_done(zram))
|
if (init_done(zram)) {
|
||||||
mem_used = zs_get_total_pages(zram->meta->mem_pool);
|
mem_used = zs_get_total_pages(zram->meta->mem_pool);
|
||||||
|
zs_pool_stats(zram->meta->mem_pool, &pool_stats);
|
||||||
|
}
|
||||||
|
|
||||||
orig_size = atomic64_read(&zram->stats.pages_stored);
|
orig_size = atomic64_read(&zram->stats.pages_stored);
|
||||||
max_used = atomic_long_read(&zram->stats.max_used_pages);
|
max_used = atomic_long_read(&zram->stats.max_used_pages);
|
||||||
|
|
||||||
ret = scnprintf(buf, PAGE_SIZE,
|
ret = scnprintf(buf, PAGE_SIZE,
|
||||||
"%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
|
"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
|
||||||
orig_size << PAGE_SHIFT,
|
orig_size << PAGE_SHIFT,
|
||||||
(u64)atomic64_read(&zram->stats.compr_data_size),
|
(u64)atomic64_read(&zram->stats.compr_data_size),
|
||||||
mem_used << PAGE_SHIFT,
|
mem_used << PAGE_SHIFT,
|
||||||
zram->limit_pages << PAGE_SHIFT,
|
zram->limit_pages << PAGE_SHIFT,
|
||||||
max_used << PAGE_SHIFT,
|
max_used << PAGE_SHIFT,
|
||||||
(u64)atomic64_read(&zram->stats.zero_pages),
|
(u64)atomic64_read(&zram->stats.zero_pages),
|
||||||
(u64)atomic64_read(&zram->stats.num_migrated));
|
pool_stats.pages_compacted);
|
||||||
up_read(&zram->init_lock);
|
up_read(&zram->init_lock);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -619,7 +622,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
|
||||||
uncmem = user_mem;
|
uncmem = user_mem;
|
||||||
|
|
||||||
if (!uncmem) {
|
if (!uncmem) {
|
||||||
pr_info("Unable to allocate temp memory\n");
|
pr_err("Unable to allocate temp memory\n");
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto out_cleanup;
|
goto out_cleanup;
|
||||||
}
|
}
|
||||||
|
@ -716,7 +719,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
|
||||||
|
|
||||||
handle = zs_malloc(meta->mem_pool, clen);
|
handle = zs_malloc(meta->mem_pool, clen);
|
||||||
if (!handle) {
|
if (!handle) {
|
||||||
pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
|
pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
|
||||||
index, clen);
|
index, clen);
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto out;
|
goto out;
|
||||||
|
@ -1036,7 +1039,7 @@ static ssize_t disksize_store(struct device *dev,
|
||||||
|
|
||||||
comp = zcomp_create(zram->compressor, zram->max_comp_streams);
|
comp = zcomp_create(zram->compressor, zram->max_comp_streams);
|
||||||
if (IS_ERR(comp)) {
|
if (IS_ERR(comp)) {
|
||||||
pr_info("Cannot initialise %s compressing backend\n",
|
pr_err("Cannot initialise %s compressing backend\n",
|
||||||
zram->compressor);
|
zram->compressor);
|
||||||
err = PTR_ERR(comp);
|
err = PTR_ERR(comp);
|
||||||
goto out_free_meta;
|
goto out_free_meta;
|
||||||
|
@ -1214,7 +1217,7 @@ static int zram_add(void)
|
||||||
/* gendisk structure */
|
/* gendisk structure */
|
||||||
zram->disk = alloc_disk(1);
|
zram->disk = alloc_disk(1);
|
||||||
if (!zram->disk) {
|
if (!zram->disk) {
|
||||||
pr_warn("Error allocating disk structure for device %d\n",
|
pr_err("Error allocating disk structure for device %d\n",
|
||||||
device_id);
|
device_id);
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
goto out_free_queue;
|
goto out_free_queue;
|
||||||
|
@ -1263,7 +1266,8 @@ static int zram_add(void)
|
||||||
ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
|
ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
|
||||||
&zram_disk_attr_group);
|
&zram_disk_attr_group);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
pr_warn("Error creating sysfs group");
|
pr_err("Error creating sysfs group for device %d\n",
|
||||||
|
device_id);
|
||||||
goto out_free_disk;
|
goto out_free_disk;
|
||||||
}
|
}
|
||||||
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
|
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
|
||||||
|
@ -1403,13 +1407,13 @@ static int __init zram_init(void)
|
||||||
|
|
||||||
ret = class_register(&zram_control_class);
|
ret = class_register(&zram_control_class);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
pr_warn("Unable to register zram-control class\n");
|
pr_err("Unable to register zram-control class\n");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
zram_major = register_blkdev(0, "zram");
|
zram_major = register_blkdev(0, "zram");
|
||||||
if (zram_major <= 0) {
|
if (zram_major <= 0) {
|
||||||
pr_warn("Unable to get major number\n");
|
pr_err("Unable to get major number\n");
|
||||||
class_unregister(&zram_control_class);
|
class_unregister(&zram_control_class);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
|
|
|
@ -78,7 +78,6 @@ struct zram_stats {
|
||||||
atomic64_t compr_data_size; /* compressed size of pages stored */
|
atomic64_t compr_data_size; /* compressed size of pages stored */
|
||||||
atomic64_t num_reads; /* failed + successful */
|
atomic64_t num_reads; /* failed + successful */
|
||||||
atomic64_t num_writes; /* --do-- */
|
atomic64_t num_writes; /* --do-- */
|
||||||
atomic64_t num_migrated; /* no. of migrated object */
|
|
||||||
atomic64_t failed_reads; /* can happen when memory is too low */
|
atomic64_t failed_reads; /* can happen when memory is too low */
|
||||||
atomic64_t failed_writes; /* can happen when memory is too low */
|
atomic64_t failed_writes; /* can happen when memory is too low */
|
||||||
atomic64_t invalid_io; /* non-page-aligned I/O requests */
|
atomic64_t invalid_io; /* non-page-aligned I/O requests */
|
||||||
|
|
|
@ -239,7 +239,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name,
|
||||||
mq->mmr_blade = uv_cpu_to_blade_id(cpu);
|
mq->mmr_blade = uv_cpu_to_blade_id(cpu);
|
||||||
|
|
||||||
nid = cpu_to_node(cpu);
|
nid = cpu_to_node(cpu);
|
||||||
page = alloc_pages_exact_node(nid,
|
page = __alloc_pages_node(nid,
|
||||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||||
pg_order);
|
pg_order);
|
||||||
if (page == NULL) {
|
if (page == NULL) {
|
||||||
|
|
|
@ -353,9 +353,16 @@ static struct sysrq_key_op sysrq_term_op = {
|
||||||
|
|
||||||
static void moom_callback(struct work_struct *ignored)
|
static void moom_callback(struct work_struct *ignored)
|
||||||
{
|
{
|
||||||
|
const gfp_t gfp_mask = GFP_KERNEL;
|
||||||
|
struct oom_control oc = {
|
||||||
|
.zonelist = node_zonelist(first_memory_node, gfp_mask),
|
||||||
|
.nodemask = NULL,
|
||||||
|
.gfp_mask = gfp_mask,
|
||||||
|
.order = -1,
|
||||||
|
};
|
||||||
|
|
||||||
mutex_lock(&oom_lock);
|
mutex_lock(&oom_lock);
|
||||||
if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
|
if (!out_of_memory(&oc))
|
||||||
GFP_KERNEL, 0, NULL, true))
|
|
||||||
pr_info("OOM request ignored because killer is disabled\n");
|
pr_info("OOM request ignored because killer is disabled\n");
|
||||||
mutex_unlock(&oom_lock);
|
mutex_unlock(&oom_lock);
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
#include <linux/namei.h>
|
#include <linux/namei.h>
|
||||||
#include <linux/log2.h>
|
#include <linux/log2.h>
|
||||||
#include <linux/cleancache.h>
|
#include <linux/cleancache.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <asm/uaccess.h>
|
#include <asm/uaccess.h>
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
|
|
||||||
|
|
197
fs/dax.c
197
fs/dax.c
|
@ -283,7 +283,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
|
||||||
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
||||||
struct vm_area_struct *vma, struct vm_fault *vmf)
|
struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
struct address_space *mapping = inode->i_mapping;
|
|
||||||
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
|
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
|
||||||
unsigned long vaddr = (unsigned long)vmf->virtual_address;
|
unsigned long vaddr = (unsigned long)vmf->virtual_address;
|
||||||
void __pmem *addr;
|
void __pmem *addr;
|
||||||
|
@ -291,8 +290,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
||||||
pgoff_t size;
|
pgoff_t size;
|
||||||
int error;
|
int error;
|
||||||
|
|
||||||
i_mmap_lock_read(mapping);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check truncate didn't happen while we were allocating a block.
|
* Check truncate didn't happen while we were allocating a block.
|
||||||
* If it did, this block may or may not be still allocated to the
|
* If it did, this block may or may not be still allocated to the
|
||||||
|
@ -322,8 +319,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
||||||
error = vm_insert_mixed(vma, vaddr, pfn);
|
error = vm_insert_mixed(vma, vaddr, pfn);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
i_mmap_unlock_read(mapping);
|
|
||||||
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -385,15 +380,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
* from a read fault and we've raced with a truncate
|
* from a read fault and we've raced with a truncate
|
||||||
*/
|
*/
|
||||||
error = -EIO;
|
error = -EIO;
|
||||||
goto unlock_page;
|
goto unlock;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
i_mmap_lock_write(mapping);
|
||||||
}
|
}
|
||||||
|
|
||||||
error = get_block(inode, block, &bh, 0);
|
error = get_block(inode, block, &bh, 0);
|
||||||
if (!error && (bh.b_size < PAGE_SIZE))
|
if (!error && (bh.b_size < PAGE_SIZE))
|
||||||
error = -EIO; /* fs corruption? */
|
error = -EIO; /* fs corruption? */
|
||||||
if (error)
|
if (error)
|
||||||
goto unlock_page;
|
goto unlock;
|
||||||
|
|
||||||
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
|
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
|
||||||
if (vmf->flags & FAULT_FLAG_WRITE) {
|
if (vmf->flags & FAULT_FLAG_WRITE) {
|
||||||
|
@ -404,8 +401,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
if (!error && (bh.b_size < PAGE_SIZE))
|
if (!error && (bh.b_size < PAGE_SIZE))
|
||||||
error = -EIO;
|
error = -EIO;
|
||||||
if (error)
|
if (error)
|
||||||
goto unlock_page;
|
goto unlock;
|
||||||
} else {
|
} else {
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
return dax_load_hole(mapping, page, vmf);
|
return dax_load_hole(mapping, page, vmf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -417,17 +415,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
else
|
else
|
||||||
clear_user_highpage(new_page, vaddr);
|
clear_user_highpage(new_page, vaddr);
|
||||||
if (error)
|
if (error)
|
||||||
goto unlock_page;
|
goto unlock;
|
||||||
vmf->page = page;
|
vmf->page = page;
|
||||||
if (!page) {
|
if (!page) {
|
||||||
i_mmap_lock_read(mapping);
|
|
||||||
/* Check we didn't race with truncate */
|
/* Check we didn't race with truncate */
|
||||||
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
|
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
|
||||||
PAGE_SHIFT;
|
PAGE_SHIFT;
|
||||||
if (vmf->pgoff >= size) {
|
if (vmf->pgoff >= size) {
|
||||||
i_mmap_unlock_read(mapping);
|
|
||||||
error = -EIO;
|
error = -EIO;
|
||||||
goto out;
|
goto unlock;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return VM_FAULT_LOCKED;
|
return VM_FAULT_LOCKED;
|
||||||
|
@ -463,6 +459,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
|
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!page)
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
out:
|
out:
|
||||||
if (error == -ENOMEM)
|
if (error == -ENOMEM)
|
||||||
return VM_FAULT_OOM | major;
|
return VM_FAULT_OOM | major;
|
||||||
|
@ -471,11 +469,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
return VM_FAULT_SIGBUS | major;
|
return VM_FAULT_SIGBUS | major;
|
||||||
return VM_FAULT_NOPAGE | major;
|
return VM_FAULT_NOPAGE | major;
|
||||||
|
|
||||||
unlock_page:
|
unlock:
|
||||||
if (page) {
|
if (page) {
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
page_cache_release(page);
|
page_cache_release(page);
|
||||||
|
} else {
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
}
|
}
|
||||||
|
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(__dax_fault);
|
EXPORT_SYMBOL(__dax_fault);
|
||||||
|
@ -507,6 +508,176 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(dax_fault);
|
EXPORT_SYMBOL_GPL(dax_fault);
|
||||||
|
|
||||||
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||||
|
/*
|
||||||
|
* The 'colour' (ie low bits) within a PMD of a page offset. This comes up
|
||||||
|
* more often than one might expect in the below function.
|
||||||
|
*/
|
||||||
|
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
|
||||||
|
|
||||||
|
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
|
||||||
|
pmd_t *pmd, unsigned int flags, get_block_t get_block,
|
||||||
|
dax_iodone_t complete_unwritten)
|
||||||
|
{
|
||||||
|
struct file *file = vma->vm_file;
|
||||||
|
struct address_space *mapping = file->f_mapping;
|
||||||
|
struct inode *inode = mapping->host;
|
||||||
|
struct buffer_head bh;
|
||||||
|
unsigned blkbits = inode->i_blkbits;
|
||||||
|
unsigned long pmd_addr = address & PMD_MASK;
|
||||||
|
bool write = flags & FAULT_FLAG_WRITE;
|
||||||
|
long length;
|
||||||
|
void *kaddr;
|
||||||
|
pgoff_t size, pgoff;
|
||||||
|
sector_t block, sector;
|
||||||
|
unsigned long pfn;
|
||||||
|
int result = 0;
|
||||||
|
|
||||||
|
/* Fall back to PTEs if we're going to COW */
|
||||||
|
if (write && !(vma->vm_flags & VM_SHARED))
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
/* If the PMD would extend outside the VMA */
|
||||||
|
if (pmd_addr < vma->vm_start)
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
if ((pmd_addr + PMD_SIZE) > vma->vm_end)
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
|
||||||
|
pgoff = linear_page_index(vma, pmd_addr);
|
||||||
|
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||||
|
if (pgoff >= size)
|
||||||
|
return VM_FAULT_SIGBUS;
|
||||||
|
/* If the PMD would cover blocks out of the file */
|
||||||
|
if ((pgoff | PG_PMD_COLOUR) >= size)
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
|
||||||
|
memset(&bh, 0, sizeof(bh));
|
||||||
|
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
|
||||||
|
|
||||||
|
bh.b_size = PMD_SIZE;
|
||||||
|
i_mmap_lock_write(mapping);
|
||||||
|
length = get_block(inode, block, &bh, write);
|
||||||
|
if (length)
|
||||||
|
return VM_FAULT_SIGBUS;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the filesystem isn't willing to tell us the length of a hole,
|
||||||
|
* just fall back to PTEs. Calling get_block 512 times in a loop
|
||||||
|
* would be silly.
|
||||||
|
*/
|
||||||
|
if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
|
||||||
|
goto fallback;
|
||||||
|
|
||||||
|
if (buffer_unwritten(&bh) || buffer_new(&bh)) {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < PTRS_PER_PMD; i++)
|
||||||
|
clear_page(kaddr + i * PAGE_SIZE);
|
||||||
|
count_vm_event(PGMAJFAULT);
|
||||||
|
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
|
||||||
|
result |= VM_FAULT_MAJOR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we allocated new storage, make sure no process has any
|
||||||
|
* zero pages covering this hole
|
||||||
|
*/
|
||||||
|
if (buffer_new(&bh)) {
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
|
unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
|
||||||
|
i_mmap_lock_write(mapping);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If a truncate happened while we were allocating blocks, we may
|
||||||
|
* leave blocks allocated to the file that are beyond EOF. We can't
|
||||||
|
* take i_mutex here, so just leave them hanging; they'll be freed
|
||||||
|
* when the file is deleted.
|
||||||
|
*/
|
||||||
|
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||||
|
if (pgoff >= size) {
|
||||||
|
result = VM_FAULT_SIGBUS;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
if ((pgoff | PG_PMD_COLOUR) >= size)
|
||||||
|
goto fallback;
|
||||||
|
|
||||||
|
if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
|
||||||
|
spinlock_t *ptl;
|
||||||
|
pmd_t entry;
|
||||||
|
struct page *zero_page = get_huge_zero_page();
|
||||||
|
|
||||||
|
if (unlikely(!zero_page))
|
||||||
|
goto fallback;
|
||||||
|
|
||||||
|
ptl = pmd_lock(vma->vm_mm, pmd);
|
||||||
|
if (!pmd_none(*pmd)) {
|
||||||
|
spin_unlock(ptl);
|
||||||
|
goto fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
entry = mk_pmd(zero_page, vma->vm_page_prot);
|
||||||
|
entry = pmd_mkhuge(entry);
|
||||||
|
set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
|
||||||
|
result = VM_FAULT_NOPAGE;
|
||||||
|
spin_unlock(ptl);
|
||||||
|
} else {
|
||||||
|
sector = bh.b_blocknr << (blkbits - 9);
|
||||||
|
length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
|
||||||
|
bh.b_size);
|
||||||
|
if (length < 0) {
|
||||||
|
result = VM_FAULT_SIGBUS;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
|
||||||
|
goto fallback;
|
||||||
|
|
||||||
|
result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
|
||||||
|
}
|
||||||
|
|
||||||
|
out:
|
||||||
|
if (buffer_unwritten(&bh))
|
||||||
|
complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
|
||||||
|
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
|
||||||
|
fallback:
|
||||||
|
count_vm_event(THP_FAULT_FALLBACK);
|
||||||
|
result = VM_FAULT_FALLBACK;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(__dax_pmd_fault);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* dax_pmd_fault - handle a PMD fault on a DAX file
|
||||||
|
* @vma: The virtual memory area where the fault occurred
|
||||||
|
* @vmf: The description of the fault
|
||||||
|
* @get_block: The filesystem method used to translate file offsets to blocks
|
||||||
|
*
|
||||||
|
* When a page fault occurs, filesystems may call this helper in their
|
||||||
|
* pmd_fault handler for DAX files.
|
||||||
|
*/
|
||||||
|
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
|
||||||
|
pmd_t *pmd, unsigned int flags, get_block_t get_block,
|
||||||
|
dax_iodone_t complete_unwritten)
|
||||||
|
{
|
||||||
|
int result;
|
||||||
|
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
|
||||||
|
|
||||||
|
if (flags & FAULT_FLAG_WRITE) {
|
||||||
|
sb_start_pagefault(sb);
|
||||||
|
file_update_time(vma->vm_file);
|
||||||
|
}
|
||||||
|
result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
|
||||||
|
complete_unwritten);
|
||||||
|
if (flags & FAULT_FLAG_WRITE)
|
||||||
|
sb_end_pagefault(sb);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(dax_pmd_fault);
|
||||||
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* dax_pfn_mkwrite - handle first write to DAX page
|
* dax_pfn_mkwrite - handle first write to DAX page
|
||||||
* @vma: The virtual memory area where the fault occurred
|
* @vma: The virtual memory area where the fault occurred
|
||||||
|
|
|
@ -20,6 +20,7 @@
|
||||||
|
|
||||||
#include <linux/time.h>
|
#include <linux/time.h>
|
||||||
#include <linux/pagemap.h>
|
#include <linux/pagemap.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/quotaops.h>
|
#include <linux/quotaops.h>
|
||||||
#include "ext2.h"
|
#include "ext2.h"
|
||||||
#include "xattr.h"
|
#include "xattr.h"
|
||||||
|
@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
return dax_fault(vma, vmf, ext2_get_block, NULL);
|
return dax_fault(vma, vmf, ext2_get_block, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||||
|
pmd_t *pmd, unsigned int flags)
|
||||||
|
{
|
||||||
|
return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
|
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
|
||||||
|
@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
|
|
||||||
static const struct vm_operations_struct ext2_dax_vm_ops = {
|
static const struct vm_operations_struct ext2_dax_vm_ops = {
|
||||||
.fault = ext2_dax_fault,
|
.fault = ext2_dax_fault,
|
||||||
|
.pmd_fault = ext2_dax_pmd_fault,
|
||||||
.page_mkwrite = ext2_dax_mkwrite,
|
.page_mkwrite = ext2_dax_mkwrite,
|
||||||
.pfn_mkwrite = dax_pfn_mkwrite,
|
.pfn_mkwrite = dax_pfn_mkwrite,
|
||||||
};
|
};
|
||||||
|
@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||||
|
|
||||||
file_accessed(file);
|
file_accessed(file);
|
||||||
vma->vm_ops = &ext2_dax_vm_ops;
|
vma->vm_ops = &ext2_dax_vm_ops;
|
||||||
vma->vm_flags |= VM_MIXEDMAP;
|
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -25,6 +25,7 @@
|
||||||
#include <linux/time.h>
|
#include <linux/time.h>
|
||||||
#include <linux/highuid.h>
|
#include <linux/highuid.h>
|
||||||
#include <linux/pagemap.h>
|
#include <linux/pagemap.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/quotaops.h>
|
#include <linux/quotaops.h>
|
||||||
#include <linux/writeback.h>
|
#include <linux/writeback.h>
|
||||||
#include <linux/buffer_head.h>
|
#include <linux/buffer_head.h>
|
||||||
|
|
|
@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
|
||||||
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
|
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
|
||||||
int ext4_get_block_write(struct inode *inode, sector_t iblock,
|
int ext4_get_block_write(struct inode *inode, sector_t iblock,
|
||||||
struct buffer_head *bh_result, int create);
|
struct buffer_head *bh_result, int create);
|
||||||
|
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
|
||||||
|
struct buffer_head *bh_result, int create);
|
||||||
int ext4_get_block(struct inode *inode, sector_t iblock,
|
int ext4_get_block(struct inode *inode, sector_t iblock,
|
||||||
struct buffer_head *bh_result, int create);
|
struct buffer_head *bh_result, int create);
|
||||||
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/mount.h>
|
#include <linux/mount.h>
|
||||||
#include <linux/path.h>
|
#include <linux/path.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/quotaops.h>
|
#include <linux/quotaops.h>
|
||||||
#include <linux/pagevec.h>
|
#include <linux/pagevec.h>
|
||||||
#include <linux/uio.h>
|
#include <linux/uio.h>
|
||||||
|
@ -195,7 +196,7 @@ out:
|
||||||
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||||
{
|
{
|
||||||
struct inode *inode = bh->b_assoc_map->host;
|
struct inode *inode = bh->b_assoc_map->host;
|
||||||
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
|
/* XXX: breaks on 32-bit > 16TB. Is that even supported? */
|
||||||
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
|
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
|
||||||
int err;
|
int err;
|
||||||
if (!uptodate)
|
if (!uptodate)
|
||||||
|
@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||||
|
|
||||||
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
|
int result;
|
||||||
/* Is this the right get_block? */
|
handle_t *handle = NULL;
|
||||||
|
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
|
||||||
|
bool write = vmf->flags & FAULT_FLAG_WRITE;
|
||||||
|
|
||||||
|
if (write) {
|
||||||
|
sb_start_pagefault(sb);
|
||||||
|
file_update_time(vma->vm_file);
|
||||||
|
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
||||||
|
EXT4_DATA_TRANS_BLOCKS(sb));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_ERR(handle))
|
||||||
|
result = VM_FAULT_SIGBUS;
|
||||||
|
else
|
||||||
|
result = __dax_fault(vma, vmf, ext4_get_block_dax,
|
||||||
|
ext4_end_io_unwritten);
|
||||||
|
|
||||||
|
if (write) {
|
||||||
|
if (!IS_ERR(handle))
|
||||||
|
ext4_journal_stop(handle);
|
||||||
|
sb_end_pagefault(sb);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||||
|
pmd_t *pmd, unsigned int flags)
|
||||||
|
{
|
||||||
|
int result;
|
||||||
|
handle_t *handle = NULL;
|
||||||
|
struct inode *inode = file_inode(vma->vm_file);
|
||||||
|
struct super_block *sb = inode->i_sb;
|
||||||
|
bool write = flags & FAULT_FLAG_WRITE;
|
||||||
|
|
||||||
|
if (write) {
|
||||||
|
sb_start_pagefault(sb);
|
||||||
|
file_update_time(vma->vm_file);
|
||||||
|
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
||||||
|
ext4_chunk_trans_blocks(inode,
|
||||||
|
PMD_SIZE / PAGE_SIZE));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_ERR(handle))
|
||||||
|
result = VM_FAULT_SIGBUS;
|
||||||
|
else
|
||||||
|
result = __dax_pmd_fault(vma, addr, pmd, flags,
|
||||||
|
ext4_get_block_dax, ext4_end_io_unwritten);
|
||||||
|
|
||||||
|
if (write) {
|
||||||
|
if (!IS_ERR(handle))
|
||||||
|
ext4_journal_stop(handle);
|
||||||
|
sb_end_pagefault(sb);
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||||
{
|
{
|
||||||
return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
|
return dax_mkwrite(vma, vmf, ext4_get_block_dax,
|
||||||
|
ext4_end_io_unwritten);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
||||||
.fault = ext4_dax_fault,
|
.fault = ext4_dax_fault,
|
||||||
|
.pmd_fault = ext4_dax_pmd_fault,
|
||||||
.page_mkwrite = ext4_dax_mkwrite,
|
.page_mkwrite = ext4_dax_mkwrite,
|
||||||
.pfn_mkwrite = dax_pfn_mkwrite,
|
.pfn_mkwrite = dax_pfn_mkwrite,
|
||||||
};
|
};
|
||||||
|
@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||||||
file_accessed(file);
|
file_accessed(file);
|
||||||
if (IS_DAX(file_inode(file))) {
|
if (IS_DAX(file_inode(file))) {
|
||||||
vma->vm_ops = &ext4_dax_vm_ops;
|
vma->vm_ops = &ext4_dax_vm_ops;
|
||||||
vma->vm_flags |= VM_MIXEDMAP;
|
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||||
} else {
|
} else {
|
||||||
vma->vm_ops = &ext4_file_vm_ops;
|
vma->vm_ops = &ext4_file_vm_ops;
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
|
|
||||||
#include "ext4_jbd2.h"
|
#include "ext4_jbd2.h"
|
||||||
#include "truncate.h"
|
#include "truncate.h"
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/uio.h>
|
#include <linux/uio.h>
|
||||||
|
|
||||||
#include <trace/events/ext4.h>
|
#include <trace/events/ext4.h>
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
#include <linux/time.h>
|
#include <linux/time.h>
|
||||||
#include <linux/highuid.h>
|
#include <linux/highuid.h>
|
||||||
#include <linux/pagemap.h>
|
#include <linux/pagemap.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/quotaops.h>
|
#include <linux/quotaops.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
#include <linux/buffer_head.h>
|
#include <linux/buffer_head.h>
|
||||||
|
@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
|
||||||
EXT4_GET_BLOCKS_NO_LOCK);
|
EXT4_GET_BLOCKS_NO_LOCK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
|
||||||
|
struct buffer_head *bh_result, int create)
|
||||||
|
{
|
||||||
|
int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
|
||||||
|
if (create)
|
||||||
|
flags |= EXT4_GET_BLOCKS_CREATE;
|
||||||
|
ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
|
||||||
|
inode->i_ino, create);
|
||||||
|
return _ext4_get_block(inode, iblock, bh_result, flags);
|
||||||
|
}
|
||||||
|
|
||||||
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
|
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
|
||||||
ssize_t size, void *private)
|
ssize_t size, void *private)
|
||||||
{
|
{
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <linux/thread_info.h>
|
#include <linux/thread_info.h>
|
||||||
#include <asm/current.h>
|
#include <asm/current.h>
|
||||||
#include <linux/sched.h> /* remove ASAP */
|
#include <linux/sched.h> /* remove ASAP */
|
||||||
|
#include <linux/falloc.h>
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
#include <linux/mount.h>
|
#include <linux/mount.h>
|
||||||
#include <linux/file.h>
|
#include <linux/file.h>
|
||||||
|
@ -84,6 +85,29 @@ static const match_table_t tokens = {
|
||||||
{Opt_err, NULL},
|
{Opt_err, NULL},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef CONFIG_NUMA
|
||||||
|
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
|
||||||
|
struct inode *inode, pgoff_t index)
|
||||||
|
{
|
||||||
|
vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
|
||||||
|
index);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
|
||||||
|
{
|
||||||
|
mpol_cond_put(vma->vm_policy);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
|
||||||
|
struct inode *inode, pgoff_t index)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static void huge_pagevec_release(struct pagevec *pvec)
|
static void huge_pagevec_release(struct pagevec *pvec)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void truncate_huge_page(struct page *page)
|
static void remove_huge_page(struct page *page)
|
||||||
{
|
{
|
||||||
ClearPageDirty(page);
|
ClearPageDirty(page);
|
||||||
ClearPageUptodate(page);
|
ClearPageUptodate(page);
|
||||||
delete_from_page_cache(page);
|
delete_from_page_cache(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void truncate_hugepages(struct inode *inode, loff_t lstart)
|
|
||||||
|
/*
|
||||||
|
* remove_inode_hugepages handles two distinct cases: truncation and hole
|
||||||
|
* punch. There are subtle differences in operation for each case.
|
||||||
|
|
||||||
|
* truncation is indicated by end of range being LLONG_MAX
|
||||||
|
* In this case, we first scan the range and release found pages.
|
||||||
|
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
|
||||||
|
* maps and global counts.
|
||||||
|
* hole punch is indicated if end is not LLONG_MAX
|
||||||
|
* In the hole punch case we scan the range and release found pages.
|
||||||
|
* Only when releasing a page is the associated region/reserv map
|
||||||
|
* deleted. The region/reserv map for ranges without associated
|
||||||
|
* pages are not modified.
|
||||||
|
* Note: If the passed end of range value is beyond the end of file, but
|
||||||
|
* not LLONG_MAX this routine still performs a hole punch operation.
|
||||||
|
*/
|
||||||
|
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||||
|
loff_t lend)
|
||||||
{
|
{
|
||||||
struct hstate *h = hstate_inode(inode);
|
struct hstate *h = hstate_inode(inode);
|
||||||
struct address_space *mapping = &inode->i_data;
|
struct address_space *mapping = &inode->i_data;
|
||||||
const pgoff_t start = lstart >> huge_page_shift(h);
|
const pgoff_t start = lstart >> huge_page_shift(h);
|
||||||
|
const pgoff_t end = lend >> huge_page_shift(h);
|
||||||
|
struct vm_area_struct pseudo_vma;
|
||||||
struct pagevec pvec;
|
struct pagevec pvec;
|
||||||
pgoff_t next;
|
pgoff_t next;
|
||||||
int i, freed = 0;
|
int i, freed = 0;
|
||||||
|
long lookup_nr = PAGEVEC_SIZE;
|
||||||
|
bool truncate_op = (lend == LLONG_MAX);
|
||||||
|
|
||||||
|
memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
|
||||||
|
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
|
||||||
pagevec_init(&pvec, 0);
|
pagevec_init(&pvec, 0);
|
||||||
next = start;
|
next = start;
|
||||||
while (1) {
|
while (next < end) {
|
||||||
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
|
/*
|
||||||
|
* Make sure to never grab more pages that we
|
||||||
|
* might possibly need.
|
||||||
|
*/
|
||||||
|
if (end - next < lookup_nr)
|
||||||
|
lookup_nr = end - next;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This pagevec_lookup() may return pages past 'end',
|
||||||
|
* so we must check for page->index > end.
|
||||||
|
*/
|
||||||
|
if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
|
||||||
if (next == start)
|
if (next == start)
|
||||||
break;
|
break;
|
||||||
next = start;
|
next = start;
|
||||||
|
@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
|
||||||
|
|
||||||
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
||||||
struct page *page = pvec.pages[i];
|
struct page *page = pvec.pages[i];
|
||||||
|
u32 hash;
|
||||||
|
|
||||||
|
hash = hugetlb_fault_mutex_hash(h, current->mm,
|
||||||
|
&pseudo_vma,
|
||||||
|
mapping, next, 0);
|
||||||
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||||
|
|
||||||
lock_page(page);
|
lock_page(page);
|
||||||
|
if (page->index >= end) {
|
||||||
|
unlock_page(page);
|
||||||
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||||
|
next = end; /* we are done */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If page is mapped, it was faulted in after being
|
||||||
|
* unmapped. Do nothing in this race case. In the
|
||||||
|
* normal case page is not mapped.
|
||||||
|
*/
|
||||||
|
if (!page_mapped(page)) {
|
||||||
|
bool rsv_on_error = !PagePrivate(page);
|
||||||
|
/*
|
||||||
|
* We must free the huge page and remove
|
||||||
|
* from page cache (remove_huge_page) BEFORE
|
||||||
|
* removing the region/reserve map
|
||||||
|
* (hugetlb_unreserve_pages). In rare out
|
||||||
|
* of memory conditions, removal of the
|
||||||
|
* region/reserve map could fail. Before
|
||||||
|
* free'ing the page, note PagePrivate which
|
||||||
|
* is used in case of error.
|
||||||
|
*/
|
||||||
|
remove_huge_page(page);
|
||||||
|
freed++;
|
||||||
|
if (!truncate_op) {
|
||||||
|
if (unlikely(hugetlb_unreserve_pages(
|
||||||
|
inode, next,
|
||||||
|
next + 1, 1)))
|
||||||
|
hugetlb_fix_reserve_counts(
|
||||||
|
inode, rsv_on_error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (page->index > next)
|
if (page->index > next)
|
||||||
next = page->index;
|
next = page->index;
|
||||||
|
|
||||||
++next;
|
++next;
|
||||||
truncate_huge_page(page);
|
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
freed++;
|
|
||||||
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||||
}
|
}
|
||||||
huge_pagevec_release(&pvec);
|
huge_pagevec_release(&pvec);
|
||||||
}
|
}
|
||||||
BUG_ON(!lstart && mapping->nrpages);
|
|
||||||
hugetlb_unreserve_pages(inode, start, freed);
|
if (truncate_op)
|
||||||
|
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void hugetlbfs_evict_inode(struct inode *inode)
|
static void hugetlbfs_evict_inode(struct inode *inode)
|
||||||
{
|
{
|
||||||
struct resv_map *resv_map;
|
struct resv_map *resv_map;
|
||||||
|
|
||||||
truncate_hugepages(inode, 0);
|
remove_inode_hugepages(inode, 0, LLONG_MAX);
|
||||||
resv_map = (struct resv_map *)inode->i_mapping->private_data;
|
resv_map = (struct resv_map *)inode->i_mapping->private_data;
|
||||||
/* root inode doesn't have the resv_map, so we should check it */
|
/* root inode doesn't have the resv_map, so we should check it */
|
||||||
if (resv_map)
|
if (resv_map)
|
||||||
|
@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
|
hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma;
|
struct vm_area_struct *vma;
|
||||||
|
|
||||||
vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
|
/*
|
||||||
|
* end == 0 indicates that the entire range after
|
||||||
|
* start should be unmapped.
|
||||||
|
*/
|
||||||
|
vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
|
||||||
unsigned long v_offset;
|
unsigned long v_offset;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
|
||||||
* which overlap the truncated area starting at pgoff,
|
* which overlap the truncated area starting at pgoff,
|
||||||
* and no vma on a 32-bit arch can span beyond the 4GB.
|
* and no vma on a 32-bit arch can span beyond the 4GB.
|
||||||
*/
|
*/
|
||||||
if (vma->vm_pgoff < pgoff)
|
if (vma->vm_pgoff < start)
|
||||||
v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
|
v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
|
||||||
else
|
else
|
||||||
v_offset = 0;
|
v_offset = 0;
|
||||||
|
|
||||||
unmap_hugepage_range(vma, vma->vm_start + v_offset,
|
if (end) {
|
||||||
vma->vm_end, NULL);
|
end = ((end - start) << PAGE_SHIFT) +
|
||||||
|
vma->vm_start + v_offset;
|
||||||
|
if (end > vma->vm_end)
|
||||||
|
end = vma->vm_end;
|
||||||
|
} else
|
||||||
|
end = vma->vm_end;
|
||||||
|
|
||||||
|
unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
|
||||||
i_size_write(inode, offset);
|
i_size_write(inode, offset);
|
||||||
i_mmap_lock_write(mapping);
|
i_mmap_lock_write(mapping);
|
||||||
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
|
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
|
||||||
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
|
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
|
||||||
i_mmap_unlock_write(mapping);
|
i_mmap_unlock_write(mapping);
|
||||||
truncate_hugepages(inode, offset);
|
remove_inode_hugepages(inode, offset, LLONG_MAX);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
|
||||||
|
{
|
||||||
|
struct hstate *h = hstate_inode(inode);
|
||||||
|
loff_t hpage_size = huge_page_size(h);
|
||||||
|
loff_t hole_start, hole_end;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For hole punch round up the beginning offset of the hole and
|
||||||
|
* round down the end.
|
||||||
|
*/
|
||||||
|
hole_start = round_up(offset, hpage_size);
|
||||||
|
hole_end = round_down(offset + len, hpage_size);
|
||||||
|
|
||||||
|
if (hole_end > hole_start) {
|
||||||
|
struct address_space *mapping = inode->i_mapping;
|
||||||
|
|
||||||
|
mutex_lock(&inode->i_mutex);
|
||||||
|
i_mmap_lock_write(mapping);
|
||||||
|
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
|
||||||
|
hugetlb_vmdelete_list(&mapping->i_mmap,
|
||||||
|
hole_start >> PAGE_SHIFT,
|
||||||
|
hole_end >> PAGE_SHIFT);
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
|
remove_inode_hugepages(inode, hole_start, hole_end);
|
||||||
|
mutex_unlock(&inode->i_mutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * fallocate() implementation for hugetlbfs.
 *
 * @file:   file being operated on
 * @mode:   FALLOC_FL_* flags; only FALLOC_FL_KEEP_SIZE and
 *          FALLOC_FL_PUNCH_HOLE are accepted, anything else yields
 *          -EOPNOTSUPP
 * @offset: byte offset of the start of the range
 * @len:    length of the range in bytes
 *
 * With FALLOC_FL_PUNCH_HOLE the request is handed to hugetlbfs_punch_hole().
 * Otherwise every huge page index touched by the range is looked up in the
 * page cache and, if absent, a zeroed huge page is allocated and inserted.
 *
 * Returns 0 on success, -EINTR if a signal was pending, or the error
 * reported by inode_newsize_ok(), alloc_huge_page() or
 * huge_add_to_page_cache().
 */
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	mutex_lock(&inode->i_mutex);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.  If NUMA is configured, use page index
	 * as input to create an allocation policy.
	 */
	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		struct page *page;
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		unsigned long addr;
		int avoid_reserve = 0;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Set numa allocation policy based on index */
		hugetlb_set_vma_policy(&pseudo_vma, inode, index);

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		page = find_get_page(mapping, index);
		if (page) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_drop_vma_policy(&pseudo_vma);
			continue;
		}

		/* Allocate page and add to page cache */
		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
		hugetlb_drop_vma_policy(&pseudo_vma);
		if (IS_ERR(page)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, addr, pages_per_huge_page(h));
		__SetPageUptodate(page);
		error = huge_add_to_page_cache(page, mapping, index);
		if (unlikely(error)) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		/*
		 * unlock_page because locked by add_to_page_cache()
		 * put_page due to reference from alloc_huge_page()
		 *
		 * Unlock first: once our own reference has been dropped we
		 * must not touch the page any more.
		 */
		unlock_page(page);
		put_page(page);
	}

	/* Also reached via the -EINTR break above, so the updates still apply. */
	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = CURRENT_TIME;
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	mutex_unlock(&inode->i_mutex);
	return error;
}
|
||||||
|
|
||||||
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
|
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
|
||||||
{
|
{
|
||||||
struct inode *inode = d_inode(dentry);
|
struct inode *inode = d_inode(dentry);
|
||||||
|
@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
|
||||||
.mmap = hugetlbfs_file_mmap,
|
.mmap = hugetlbfs_file_mmap,
|
||||||
.fsync = noop_fsync,
|
.fsync = noop_fsync,
|
||||||
.get_unmapped_area = hugetlb_get_unmapped_area,
|
.get_unmapped_area = hugetlb_get_unmapped_area,
|
||||||
.llseek = default_llseek,
|
.llseek = default_llseek,
|
||||||
|
.fallocate = hugetlbfs_fallocate,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const struct inode_operations hugetlbfs_dir_inode_operations = {
|
static const struct inode_operations hugetlbfs_dir_inode_operations = {
|
||||||
|
|
|
@ -446,6 +446,7 @@ struct mem_size_stats {
|
||||||
unsigned long anonymous_thp;
|
unsigned long anonymous_thp;
|
||||||
unsigned long swap;
|
unsigned long swap;
|
||||||
u64 pss;
|
u64 pss;
|
||||||
|
u64 swap_pss;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void smaps_account(struct mem_size_stats *mss, struct page *page,
|
static void smaps_account(struct mem_size_stats *mss, struct page *page,
|
||||||
|
@ -492,9 +493,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
|
||||||
} else if (is_swap_pte(*pte)) {
|
} else if (is_swap_pte(*pte)) {
|
||||||
swp_entry_t swpent = pte_to_swp_entry(*pte);
|
swp_entry_t swpent = pte_to_swp_entry(*pte);
|
||||||
|
|
||||||
if (!non_swap_entry(swpent))
|
if (!non_swap_entry(swpent)) {
|
||||||
|
int mapcount;
|
||||||
|
|
||||||
mss->swap += PAGE_SIZE;
|
mss->swap += PAGE_SIZE;
|
||||||
else if (is_migration_entry(swpent))
|
mapcount = swp_swapcount(swpent);
|
||||||
|
if (mapcount >= 2) {
|
||||||
|
u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
|
||||||
|
|
||||||
|
do_div(pss_delta, mapcount);
|
||||||
|
mss->swap_pss += pss_delta;
|
||||||
|
} else {
|
||||||
|
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
|
||||||
|
}
|
||||||
|
} else if (is_migration_entry(swpent))
|
||||||
page = migration_entry_to_page(swpent);
|
page = migration_entry_to_page(swpent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -640,6 +652,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||||
"Anonymous: %8lu kB\n"
|
"Anonymous: %8lu kB\n"
|
||||||
"AnonHugePages: %8lu kB\n"
|
"AnonHugePages: %8lu kB\n"
|
||||||
"Swap: %8lu kB\n"
|
"Swap: %8lu kB\n"
|
||||||
|
"SwapPss: %8lu kB\n"
|
||||||
"KernelPageSize: %8lu kB\n"
|
"KernelPageSize: %8lu kB\n"
|
||||||
"MMUPageSize: %8lu kB\n"
|
"MMUPageSize: %8lu kB\n"
|
||||||
"Locked: %8lu kB\n",
|
"Locked: %8lu kB\n",
|
||||||
|
@ -654,6 +667,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
|
||||||
mss.anonymous >> 10,
|
mss.anonymous >> 10,
|
||||||
mss.anonymous_thp >> 10,
|
mss.anonymous_thp >> 10,
|
||||||
mss.swap >> 10,
|
mss.swap >> 10,
|
||||||
|
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
|
||||||
vma_kernel_pagesize(vma) >> 10,
|
vma_kernel_pagesize(vma) >> 10,
|
||||||
vma_mmu_pagesize(vma) >> 10,
|
vma_mmu_pagesize(vma) >> 10,
|
||||||
(vma->vm_flags & VM_LOCKED) ?
|
(vma->vm_flags & VM_LOCKED) ?
|
||||||
|
@ -712,23 +726,6 @@ const struct file_operations proc_tid_smaps_operations = {
|
||||||
.release = proc_map_release,
|
.release = proc_map_release,
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
* We do not want to have constant page-shift bits sitting in
|
|
||||||
* pagemap entries and are about to reuse them some time soon.
|
|
||||||
*
|
|
||||||
* Here's the "migration strategy":
|
|
||||||
* 1. when the system boots these bits remain what they are,
|
|
||||||
* but a warning about future change is printed in log;
|
|
||||||
* 2. once anyone clears soft-dirty bits via clear_refs file,
|
|
||||||
* these flag is set to denote, that user is aware of the
|
|
||||||
* new API and those page-shift bits change their meaning.
|
|
||||||
* The respective warning is printed in dmesg;
|
|
||||||
* 3. In a couple of releases we will remove all the mentions
|
|
||||||
* of page-shift in pagemap entries.
|
|
||||||
*/
|
|
||||||
|
|
||||||
static bool soft_dirty_cleared __read_mostly;
|
|
||||||
|
|
||||||
enum clear_refs_types {
|
enum clear_refs_types {
|
||||||
CLEAR_REFS_ALL = 1,
|
CLEAR_REFS_ALL = 1,
|
||||||
CLEAR_REFS_ANON,
|
CLEAR_REFS_ANON,
|
||||||
|
@ -889,13 +886,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
|
||||||
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
|
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
if (type == CLEAR_REFS_SOFT_DIRTY) {
|
|
||||||
soft_dirty_cleared = true;
|
|
||||||
pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
|
|
||||||
" See the linux/Documentation/vm/pagemap.txt for "
|
|
||||||
"details.\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
task = get_proc_task(file_inode(file));
|
task = get_proc_task(file_inode(file));
|
||||||
if (!task)
|
if (!task)
|
||||||
return -ESRCH;
|
return -ESRCH;
|
||||||
|
@ -963,36 +953,26 @@ typedef struct {
|
||||||
struct pagemapread {
|
struct pagemapread {
|
||||||
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
|
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
|
||||||
pagemap_entry_t *buffer;
|
pagemap_entry_t *buffer;
|
||||||
bool v2;
|
bool show_pfn;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
|
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
|
||||||
#define PAGEMAP_WALK_MASK (PMD_MASK)
|
#define PAGEMAP_WALK_MASK (PMD_MASK)
|
||||||
|
|
||||||
#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
|
#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
|
||||||
#define PM_STATUS_BITS 3
|
#define PM_PFRAME_BITS 55
|
||||||
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
|
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
|
||||||
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
|
#define PM_SOFT_DIRTY BIT_ULL(55)
|
||||||
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
|
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
|
||||||
#define PM_PSHIFT_BITS 6
|
#define PM_FILE BIT_ULL(61)
|
||||||
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
|
#define PM_SWAP BIT_ULL(62)
|
||||||
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
|
#define PM_PRESENT BIT_ULL(63)
|
||||||
#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
|
|
||||||
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
|
|
||||||
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
|
|
||||||
/* in "new" pagemap pshift bits are occupied with more status bits */
|
|
||||||
#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
|
|
||||||
|
|
||||||
#define __PM_SOFT_DIRTY (1LL)
|
|
||||||
#define PM_PRESENT PM_STATUS(4LL)
|
|
||||||
#define PM_SWAP PM_STATUS(2LL)
|
|
||||||
#define PM_FILE PM_STATUS(1LL)
|
|
||||||
#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
|
|
||||||
#define PM_END_OF_BUFFER 1
|
#define PM_END_OF_BUFFER 1
|
||||||
|
|
||||||
static inline pagemap_entry_t make_pme(u64 val)
|
static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
|
||||||
{
|
{
|
||||||
return (pagemap_entry_t) { .pme = val };
|
return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
|
||||||
}
|
}
|
||||||
|
|
||||||
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
|
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
|
||||||
|
@ -1013,7 +993,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
|
||||||
|
|
||||||
while (addr < end) {
|
while (addr < end) {
|
||||||
struct vm_area_struct *vma = find_vma(walk->mm, addr);
|
struct vm_area_struct *vma = find_vma(walk->mm, addr);
|
||||||
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
|
pagemap_entry_t pme = make_pme(0, 0);
|
||||||
/* End of address space hole, which we mark as non-present. */
|
/* End of address space hole, which we mark as non-present. */
|
||||||
unsigned long hole_end;
|
unsigned long hole_end;
|
||||||
|
|
||||||
|
@ -1033,7 +1013,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
|
||||||
|
|
||||||
/* Addresses in the VMA. */
|
/* Addresses in the VMA. */
|
||||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||||
pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
|
pme = make_pme(0, PM_SOFT_DIRTY);
|
||||||
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
|
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
|
||||||
err = add_to_pagemap(addr, &pme, pm);
|
err = add_to_pagemap(addr, &pme, pm);
|
||||||
if (err)
|
if (err)
|
||||||
|
@ -1044,67 +1024,42 @@ out:
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
|
||||||
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
|
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
|
||||||
{
|
{
|
||||||
u64 frame, flags;
|
u64 frame = 0, flags = 0;
|
||||||
struct page *page = NULL;
|
struct page *page = NULL;
|
||||||
int flags2 = 0;
|
|
||||||
|
|
||||||
if (pte_present(pte)) {
|
if (pte_present(pte)) {
|
||||||
frame = pte_pfn(pte);
|
if (pm->show_pfn)
|
||||||
flags = PM_PRESENT;
|
frame = pte_pfn(pte);
|
||||||
|
flags |= PM_PRESENT;
|
||||||
page = vm_normal_page(vma, addr, pte);
|
page = vm_normal_page(vma, addr, pte);
|
||||||
if (pte_soft_dirty(pte))
|
if (pte_soft_dirty(pte))
|
||||||
flags2 |= __PM_SOFT_DIRTY;
|
flags |= PM_SOFT_DIRTY;
|
||||||
} else if (is_swap_pte(pte)) {
|
} else if (is_swap_pte(pte)) {
|
||||||
swp_entry_t entry;
|
swp_entry_t entry;
|
||||||
if (pte_swp_soft_dirty(pte))
|
if (pte_swp_soft_dirty(pte))
|
||||||
flags2 |= __PM_SOFT_DIRTY;
|
flags |= PM_SOFT_DIRTY;
|
||||||
entry = pte_to_swp_entry(pte);
|
entry = pte_to_swp_entry(pte);
|
||||||
frame = swp_type(entry) |
|
frame = swp_type(entry) |
|
||||||
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
|
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
|
||||||
flags = PM_SWAP;
|
flags |= PM_SWAP;
|
||||||
if (is_migration_entry(entry))
|
if (is_migration_entry(entry))
|
||||||
page = migration_entry_to_page(entry);
|
page = migration_entry_to_page(entry);
|
||||||
} else {
|
|
||||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
|
||||||
flags2 |= __PM_SOFT_DIRTY;
|
|
||||||
*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (page && !PageAnon(page))
|
if (page && !PageAnon(page))
|
||||||
flags |= PM_FILE;
|
flags |= PM_FILE;
|
||||||
if ((vma->vm_flags & VM_SOFTDIRTY))
|
if (page && page_mapcount(page) == 1)
|
||||||
flags2 |= __PM_SOFT_DIRTY;
|
flags |= PM_MMAP_EXCLUSIVE;
|
||||||
|
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||||
|
flags |= PM_SOFT_DIRTY;
|
||||||
|
|
||||||
*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
|
return make_pme(frame, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
|
||||||
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
|
||||||
pmd_t pmd, int offset, int pmd_flags2)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* Currently pmd for thp is always present because thp can not be
|
|
||||||
* swapped-out, migrated, or HWPOISONed (split in such cases instead.)
|
|
||||||
* This if-check is just to prepare for future implementation.
|
|
||||||
*/
|
|
||||||
if (pmd_present(pmd))
|
|
||||||
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
|
|
||||||
| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
|
|
||||||
else
|
|
||||||
*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
|
||||||
pmd_t pmd, int offset, int pmd_flags2)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
|
||||||
struct mm_walk *walk)
|
struct mm_walk *walk)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *vma = walk->vma;
|
struct vm_area_struct *vma = walk->vma;
|
||||||
|
@ -1113,41 +1068,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||||
pte_t *pte, *orig_pte;
|
pte_t *pte, *orig_pte;
|
||||||
int err = 0;
|
int err = 0;
|
||||||
|
|
||||||
if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||||
int pmd_flags2;
|
if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
|
||||||
|
u64 flags = 0, frame = 0;
|
||||||
|
pmd_t pmd = *pmdp;
|
||||||
|
|
||||||
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
|
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
|
||||||
pmd_flags2 = __PM_SOFT_DIRTY;
|
flags |= PM_SOFT_DIRTY;
|
||||||
else
|
|
||||||
pmd_flags2 = 0;
|
/*
|
||||||
|
* Currently pmd for thp is always present because thp
|
||||||
|
* can not be swapped-out, migrated, or HWPOISONed
|
||||||
|
* (split in such cases instead.)
|
||||||
|
* This if-check is just to prepare for future implementation.
|
||||||
|
*/
|
||||||
|
if (pmd_present(pmd)) {
|
||||||
|
struct page *page = pmd_page(pmd);
|
||||||
|
|
||||||
|
if (page_mapcount(page) == 1)
|
||||||
|
flags |= PM_MMAP_EXCLUSIVE;
|
||||||
|
|
||||||
|
flags |= PM_PRESENT;
|
||||||
|
if (pm->show_pfn)
|
||||||
|
frame = pmd_pfn(pmd) +
|
||||||
|
((addr & ~PMD_MASK) >> PAGE_SHIFT);
|
||||||
|
}
|
||||||
|
|
||||||
for (; addr != end; addr += PAGE_SIZE) {
|
for (; addr != end; addr += PAGE_SIZE) {
|
||||||
unsigned long offset;
|
pagemap_entry_t pme = make_pme(frame, flags);
|
||||||
pagemap_entry_t pme;
|
|
||||||
|
|
||||||
offset = (addr & ~PAGEMAP_WALK_MASK) >>
|
|
||||||
PAGE_SHIFT;
|
|
||||||
thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
|
|
||||||
err = add_to_pagemap(addr, &pme, pm);
|
err = add_to_pagemap(addr, &pme, pm);
|
||||||
if (err)
|
if (err)
|
||||||
break;
|
break;
|
||||||
|
if (pm->show_pfn && (flags & PM_PRESENT))
|
||||||
|
frame++;
|
||||||
}
|
}
|
||||||
spin_unlock(ptl);
|
spin_unlock(ptl);
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pmd_trans_unstable(pmd))
|
if (pmd_trans_unstable(pmdp))
|
||||||
return 0;
|
return 0;
|
||||||
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We can assume that @vma always points to a valid one and @end never
|
* We can assume that @vma always points to a valid one and @end never
|
||||||
* goes beyond vma->vm_end.
|
* goes beyond vma->vm_end.
|
||||||
*/
|
*/
|
||||||
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
|
orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
|
||||||
for (; addr < end; pte++, addr += PAGE_SIZE) {
|
for (; addr < end; pte++, addr += PAGE_SIZE) {
|
||||||
pagemap_entry_t pme;
|
pagemap_entry_t pme;
|
||||||
|
|
||||||
pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
|
pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
|
||||||
err = add_to_pagemap(addr, &pme, pm);
|
err = add_to_pagemap(addr, &pme, pm);
|
||||||
if (err)
|
if (err)
|
||||||
break;
|
break;
|
||||||
|
@ -1160,40 +1132,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_HUGETLB_PAGE
|
#ifdef CONFIG_HUGETLB_PAGE
|
||||||
static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
|
|
||||||
pte_t pte, int offset, int flags2)
|
|
||||||
{
|
|
||||||
if (pte_present(pte))
|
|
||||||
*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
|
|
||||||
PM_STATUS2(pm->v2, flags2) |
|
|
||||||
PM_PRESENT);
|
|
||||||
else
|
|
||||||
*pme = make_pme(PM_NOT_PRESENT(pm->v2) |
|
|
||||||
PM_STATUS2(pm->v2, flags2));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* This function walks within one hugetlb entry in the single call */
|
/* This function walks within one hugetlb entry in the single call */
|
||||||
static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
|
static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
|
||||||
unsigned long addr, unsigned long end,
|
unsigned long addr, unsigned long end,
|
||||||
struct mm_walk *walk)
|
struct mm_walk *walk)
|
||||||
{
|
{
|
||||||
struct pagemapread *pm = walk->private;
|
struct pagemapread *pm = walk->private;
|
||||||
struct vm_area_struct *vma = walk->vma;
|
struct vm_area_struct *vma = walk->vma;
|
||||||
|
u64 flags = 0, frame = 0;
|
||||||
int err = 0;
|
int err = 0;
|
||||||
int flags2;
|
pte_t pte;
|
||||||
pagemap_entry_t pme;
|
|
||||||
|
|
||||||
if (vma->vm_flags & VM_SOFTDIRTY)
|
if (vma->vm_flags & VM_SOFTDIRTY)
|
||||||
flags2 = __PM_SOFT_DIRTY;
|
flags |= PM_SOFT_DIRTY;
|
||||||
else
|
|
||||||
flags2 = 0;
|
pte = huge_ptep_get(ptep);
|
||||||
|
if (pte_present(pte)) {
|
||||||
|
struct page *page = pte_page(pte);
|
||||||
|
|
||||||
|
if (!PageAnon(page))
|
||||||
|
flags |= PM_FILE;
|
||||||
|
|
||||||
|
if (page_mapcount(page) == 1)
|
||||||
|
flags |= PM_MMAP_EXCLUSIVE;
|
||||||
|
|
||||||
|
flags |= PM_PRESENT;
|
||||||
|
if (pm->show_pfn)
|
||||||
|
frame = pte_pfn(pte) +
|
||||||
|
((addr & ~hmask) >> PAGE_SHIFT);
|
||||||
|
}
|
||||||
|
|
||||||
for (; addr != end; addr += PAGE_SIZE) {
|
for (; addr != end; addr += PAGE_SIZE) {
|
||||||
int offset = (addr & ~hmask) >> PAGE_SHIFT;
|
pagemap_entry_t pme = make_pme(frame, flags);
|
||||||
huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
|
|
||||||
err = add_to_pagemap(addr, &pme, pm);
|
err = add_to_pagemap(addr, &pme, pm);
|
||||||
if (err)
|
if (err)
|
||||||
return err;
|
return err;
|
||||||
|
if (pm->show_pfn && (flags & PM_PRESENT))
|
||||||
|
frame++;
|
||||||
}
|
}
|
||||||
|
|
||||||
cond_resched();
|
cond_resched();
|
||||||
|
@ -1211,7 +1187,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
|
||||||
* Bits 0-54 page frame number (PFN) if present
|
* Bits 0-54 page frame number (PFN) if present
|
||||||
* Bits 0-4 swap type if swapped
|
* Bits 0-4 swap type if swapped
|
||||||
* Bits 5-54 swap offset if swapped
|
* Bits 5-54 swap offset if swapped
|
||||||
* Bits 55-60 page shift (page size = 1<<page shift)
|
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
|
||||||
|
* Bit 56 page exclusively mapped
|
||||||
|
* Bits 57-60 zero
|
||||||
* Bit 61 page is file-page or shared-anon
|
* Bit 61 page is file-page or shared-anon
|
||||||
* Bit 62 page swapped
|
* Bit 62 page swapped
|
||||||
* Bit 63 page present
|
* Bit 63 page present
|
||||||
|
@ -1229,42 +1207,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
|
||||||
static ssize_t pagemap_read(struct file *file, char __user *buf,
|
static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||||
size_t count, loff_t *ppos)
|
size_t count, loff_t *ppos)
|
||||||
{
|
{
|
||||||
struct task_struct *task = get_proc_task(file_inode(file));
|
struct mm_struct *mm = file->private_data;
|
||||||
struct mm_struct *mm;
|
|
||||||
struct pagemapread pm;
|
struct pagemapread pm;
|
||||||
int ret = -ESRCH;
|
|
||||||
struct mm_walk pagemap_walk = {};
|
struct mm_walk pagemap_walk = {};
|
||||||
unsigned long src;
|
unsigned long src;
|
||||||
unsigned long svpfn;
|
unsigned long svpfn;
|
||||||
unsigned long start_vaddr;
|
unsigned long start_vaddr;
|
||||||
unsigned long end_vaddr;
|
unsigned long end_vaddr;
|
||||||
int copied = 0;
|
int ret = 0, copied = 0;
|
||||||
|
|
||||||
if (!task)
|
if (!mm || !atomic_inc_not_zero(&mm->mm_users))
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
ret = -EINVAL;
|
ret = -EINVAL;
|
||||||
/* file position must be aligned */
|
/* file position must be aligned */
|
||||||
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
|
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
|
||||||
goto out_task;
|
goto out_mm;
|
||||||
|
|
||||||
ret = 0;
|
ret = 0;
|
||||||
if (!count)
|
if (!count)
|
||||||
goto out_task;
|
goto out_mm;
|
||||||
|
|
||||||
|
/* do not disclose physical addresses: attack vector */
|
||||||
|
pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
|
||||||
|
|
||||||
pm.v2 = soft_dirty_cleared;
|
|
||||||
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
|
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
|
||||||
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
|
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
|
||||||
ret = -ENOMEM;
|
ret = -ENOMEM;
|
||||||
if (!pm.buffer)
|
if (!pm.buffer)
|
||||||
goto out_task;
|
goto out_mm;
|
||||||
|
|
||||||
mm = mm_access(task, PTRACE_MODE_READ);
|
pagemap_walk.pmd_entry = pagemap_pmd_range;
|
||||||
ret = PTR_ERR(mm);
|
|
||||||
if (!mm || IS_ERR(mm))
|
|
||||||
goto out_free;
|
|
||||||
|
|
||||||
pagemap_walk.pmd_entry = pagemap_pte_range;
|
|
||||||
pagemap_walk.pte_hole = pagemap_pte_hole;
|
pagemap_walk.pte_hole = pagemap_pte_hole;
|
||||||
#ifdef CONFIG_HUGETLB_PAGE
|
#ifdef CONFIG_HUGETLB_PAGE
|
||||||
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
|
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
|
||||||
|
@ -1275,10 +1248,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||||
src = *ppos;
|
src = *ppos;
|
||||||
svpfn = src / PM_ENTRY_BYTES;
|
svpfn = src / PM_ENTRY_BYTES;
|
||||||
start_vaddr = svpfn << PAGE_SHIFT;
|
start_vaddr = svpfn << PAGE_SHIFT;
|
||||||
end_vaddr = TASK_SIZE_OF(task);
|
end_vaddr = mm->task_size;
|
||||||
|
|
||||||
/* watch out for wraparound */
|
/* watch out for wraparound */
|
||||||
if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
|
if (svpfn > mm->task_size >> PAGE_SHIFT)
|
||||||
start_vaddr = end_vaddr;
|
start_vaddr = end_vaddr;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1305,7 +1278,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||||
len = min(count, PM_ENTRY_BYTES * pm.pos);
|
len = min(count, PM_ENTRY_BYTES * pm.pos);
|
||||||
if (copy_to_user(buf, pm.buffer, len)) {
|
if (copy_to_user(buf, pm.buffer, len)) {
|
||||||
ret = -EFAULT;
|
ret = -EFAULT;
|
||||||
goto out_mm;
|
goto out_free;
|
||||||
}
|
}
|
||||||
copied += len;
|
copied += len;
|
||||||
buf += len;
|
buf += len;
|
||||||
|
@ -1315,24 +1288,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
|
||||||
if (!ret || ret == PM_END_OF_BUFFER)
|
if (!ret || ret == PM_END_OF_BUFFER)
|
||||||
ret = copied;
|
ret = copied;
|
||||||
|
|
||||||
out_mm:
|
|
||||||
mmput(mm);
|
|
||||||
out_free:
|
out_free:
|
||||||
kfree(pm.buffer);
|
kfree(pm.buffer);
|
||||||
out_task:
|
out_mm:
|
||||||
put_task_struct(task);
|
mmput(mm);
|
||||||
out:
|
out:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int pagemap_open(struct inode *inode, struct file *file)
|
static int pagemap_open(struct inode *inode, struct file *file)
|
||||||
{
|
{
|
||||||
/* do not disclose physical addresses: attack vector */
|
struct mm_struct *mm;
|
||||||
if (!capable(CAP_SYS_ADMIN))
|
|
||||||
return -EPERM;
|
mm = proc_mem_open(inode, PTRACE_MODE_READ);
|
||||||
pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
|
if (IS_ERR(mm))
|
||||||
"to stop being page-shift some time soon. See the "
|
return PTR_ERR(mm);
|
||||||
"linux/Documentation/vm/pagemap.txt for details.\n");
|
file->private_data = mm;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int pagemap_release(struct inode *inode, struct file *file)
|
||||||
|
{
|
||||||
|
struct mm_struct *mm = file->private_data;
|
||||||
|
|
||||||
|
if (mm)
|
||||||
|
mmdrop(mm);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1340,6 +1320,7 @@ const struct file_operations proc_pagemap_operations = {
|
||||||
.llseek = mem_lseek, /* borrow this */
|
.llseek = mem_lseek, /* borrow this */
|
||||||
.read = pagemap_read,
|
.read = pagemap_read,
|
||||||
.open = pagemap_open,
|
.open = pagemap_open,
|
||||||
|
.release = pagemap_release,
|
||||||
};
|
};
|
||||||
#endif /* CONFIG_PROC_PAGE_MONITOR */
|
#endif /* CONFIG_PROC_PAGE_MONITOR */
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#include <linux/spinlock.h>
|
#include <linux/spinlock.h>
|
||||||
#include <linux/mm.h>
|
#include <linux/mm.h>
|
||||||
#include <linux/fs.h>
|
#include <linux/fs.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/buffer_head.h>
|
#include <linux/buffer_head.h>
|
||||||
#include <linux/uio.h>
|
#include <linux/uio.h>
|
||||||
#include <linux/list_lru.h>
|
#include <linux/list_lru.h>
|
||||||
|
|
|
@ -1546,8 +1546,36 @@ xfs_filemap_fault(
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
STATIC int
|
||||||
|
xfs_filemap_pmd_fault(
|
||||||
|
struct vm_area_struct *vma,
|
||||||
|
unsigned long addr,
|
||||||
|
pmd_t *pmd,
|
||||||
|
unsigned int flags)
|
||||||
|
{
|
||||||
|
struct inode *inode = file_inode(vma->vm_file);
|
||||||
|
struct xfs_inode *ip = XFS_I(inode);
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
if (!IS_DAX(inode))
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
|
||||||
|
trace_xfs_filemap_pmd_fault(ip);
|
||||||
|
|
||||||
|
sb_start_pagefault(inode->i_sb);
|
||||||
|
file_update_time(vma->vm_file);
|
||||||
|
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
|
||||||
|
ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
|
||||||
|
xfs_end_io_dax_write);
|
||||||
|
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
|
||||||
|
sb_end_pagefault(inode->i_sb);
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
static const struct vm_operations_struct xfs_file_vm_ops = {
|
static const struct vm_operations_struct xfs_file_vm_ops = {
|
||||||
.fault = xfs_filemap_fault,
|
.fault = xfs_filemap_fault,
|
||||||
|
.pmd_fault = xfs_filemap_pmd_fault,
|
||||||
.map_pages = filemap_map_pages,
|
.map_pages = filemap_map_pages,
|
||||||
.page_mkwrite = xfs_filemap_page_mkwrite,
|
.page_mkwrite = xfs_filemap_page_mkwrite,
|
||||||
};
|
};
|
||||||
|
@ -1560,7 +1588,7 @@ xfs_file_mmap(
|
||||||
file_accessed(filp);
|
file_accessed(filp);
|
||||||
vma->vm_ops = &xfs_file_vm_ops;
|
vma->vm_ops = &xfs_file_vm_ops;
|
||||||
if (IS_DAX(file_inode(filp)))
|
if (IS_DAX(file_inode(filp)))
|
||||||
vma->vm_flags |= VM_MIXEDMAP;
|
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
|
||||||
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
|
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
|
||||||
|
|
||||||
DEFINE_INODE_EVENT(xfs_filemap_fault);
|
DEFINE_INODE_EVENT(xfs_filemap_fault);
|
||||||
|
DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
|
||||||
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
|
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
|
||||||
|
|
||||||
DECLARE_EVENT_CLASS(xfs_iref_class,
|
DECLARE_EVENT_CLASS(xfs_iref_class,
|
||||||
|
|
|
@ -35,6 +35,12 @@ extern void early_ioremap_setup(void);
|
||||||
*/
|
*/
|
||||||
extern void early_ioremap_reset(void);
|
extern void early_ioremap_reset(void);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Early copy from unmapped memory to kernel mapped memory.
|
||||||
|
*/
|
||||||
|
extern void copy_from_early_mem(void *dest, phys_addr_t src,
|
||||||
|
unsigned long size);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
static inline void early_ioremap_init(void) { }
|
static inline void early_ioremap_init(void) { }
|
||||||
static inline void early_ioremap_setup(void) { }
|
static inline void early_ioremap_setup(void) { }
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
#ifndef _LINUX_DAX_H
|
||||||
|
#define _LINUX_DAX_H
|
||||||
|
|
||||||
|
#include <linux/fs.h>
|
||||||
|
#include <linux/mm.h>
|
||||||
|
#include <asm/pgtable.h>
|
||||||
|
|
||||||
|
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
|
||||||
|
get_block_t, dio_iodone_t, int flags);
|
||||||
|
int dax_clear_blocks(struct inode *, sector_t block, long size);
|
||||||
|
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
|
||||||
|
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
|
||||||
|
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||||
|
dax_iodone_t);
|
||||||
|
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||||
|
dax_iodone_t);
|
||||||
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||||
|
int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
||||||
|
unsigned int flags, get_block_t, dax_iodone_t);
|
||||||
|
int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
||||||
|
unsigned int flags, get_block_t, dax_iodone_t);
|
||||||
|
#else
|
||||||
|
static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||||
|
pmd_t *pmd, unsigned int flags, get_block_t gb,
|
||||||
|
dax_iodone_t di)
|
||||||
|
{
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
}
|
||||||
|
#define __dax_pmd_fault dax_pmd_fault
|
||||||
|
#endif
|
||||||
|
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
|
||||||
|
#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
|
||||||
|
#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
|
||||||
|
|
||||||
|
static inline bool vma_is_dax(struct vm_area_struct *vma)
|
||||||
|
{
|
||||||
|
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -24,6 +24,12 @@ void dma_pool_destroy(struct dma_pool *pool);
|
||||||
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||||
dma_addr_t *handle);
|
dma_addr_t *handle);
|
||||||
|
|
||||||
|
static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||||
|
dma_addr_t *handle)
|
||||||
|
{
|
||||||
|
return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
|
||||||
|
}
|
||||||
|
|
||||||
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
|
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -52,7 +52,6 @@ struct swap_info_struct;
|
||||||
struct seq_file;
|
struct seq_file;
|
||||||
struct workqueue_struct;
|
struct workqueue_struct;
|
||||||
struct iov_iter;
|
struct iov_iter;
|
||||||
struct vm_fault;
|
|
||||||
|
|
||||||
extern void __init inode_init(void);
|
extern void __init inode_init(void);
|
||||||
extern void __init inode_init_early(void);
|
extern void __init inode_init_early(void);
|
||||||
|
@ -2678,19 +2677,6 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
|
||||||
extern int generic_file_open(struct inode * inode, struct file * filp);
|
extern int generic_file_open(struct inode * inode, struct file * filp);
|
||||||
extern int nonseekable_open(struct inode * inode, struct file * filp);
|
extern int nonseekable_open(struct inode * inode, struct file * filp);
|
||||||
|
|
||||||
ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
|
|
||||||
get_block_t, dio_iodone_t, int flags);
|
|
||||||
int dax_clear_blocks(struct inode *, sector_t block, long size);
|
|
||||||
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
|
|
||||||
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
|
|
||||||
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
|
||||||
dax_iodone_t);
|
|
||||||
int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
|
||||||
dax_iodone_t);
|
|
||||||
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
|
|
||||||
#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
|
|
||||||
#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
|
|
||||||
|
|
||||||
#ifdef CONFIG_BLOCK
|
#ifdef CONFIG_BLOCK
|
||||||
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
|
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
|
||||||
loff_t file_offset);
|
loff_t file_offset);
|
||||||
|
|
|
@ -63,7 +63,10 @@ struct vm_area_struct;
|
||||||
* but it is definitely preferable to use the flag rather than opencode endless
|
* but it is definitely preferable to use the flag rather than opencode endless
|
||||||
* loop around allocator.
|
* loop around allocator.
|
||||||
*
|
*
|
||||||
* __GFP_NORETRY: The VM implementation must not retry indefinitely.
|
* __GFP_NORETRY: The VM implementation must not retry indefinitely and will
|
||||||
|
* return NULL when direct reclaim and memory compaction have failed to allow
|
||||||
|
* the allocation to succeed. The OOM killer is not called with the current
|
||||||
|
* implementation.
|
||||||
*
|
*
|
||||||
* __GFP_MOVABLE: Flag that this page will be movable by the page migration
|
* __GFP_MOVABLE: Flag that this page will be movable by the page migration
|
||||||
* mechanism or reclaimed
|
* mechanism or reclaimed
|
||||||
|
@ -300,22 +303,31 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
|
||||||
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
|
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
|
/*
|
||||||
unsigned int order)
|
* Allocate pages, preferring the node given as nid. The node must be valid and
|
||||||
|
* online. For a more general interface, see alloc_pages_node().
|
||||||
|
*/
|
||||||
|
static inline struct page *
|
||||||
|
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
|
||||||
{
|
{
|
||||||
/* Unknown node is current node */
|
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
|
||||||
if (nid < 0)
|
VM_WARN_ON(!node_online(nid));
|
||||||
nid = numa_node_id();
|
|
||||||
|
|
||||||
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
|
/*
|
||||||
|
* Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
|
||||||
|
* prefer the current CPU's closest node. Otherwise node must be valid and
|
||||||
|
* online.
|
||||||
|
*/
|
||||||
|
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
|
||||||
unsigned int order)
|
unsigned int order)
|
||||||
{
|
{
|
||||||
VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
|
if (nid == NUMA_NO_NODE)
|
||||||
|
nid = numa_mem_id();
|
||||||
|
|
||||||
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
|
return __alloc_pages_node(nid, gfp_mask, order);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
|
@ -354,7 +366,6 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask);
|
||||||
|
|
||||||
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
|
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
|
||||||
void free_pages_exact(void *virt, size_t size);
|
void free_pages_exact(void *virt, size_t size);
|
||||||
/* This is different from alloc_pages_exact_node !!! */
|
|
||||||
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
|
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
|
||||||
|
|
||||||
#define __get_free_page(gfp_mask) \
|
#define __get_free_page(gfp_mask) \
|
||||||
|
|
|
@ -33,6 +33,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
|
||||||
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
||||||
unsigned long addr, pgprot_t newprot,
|
unsigned long addr, pgprot_t newprot,
|
||||||
int prot_numa);
|
int prot_numa);
|
||||||
|
int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
|
||||||
|
unsigned long pfn, bool write);
|
||||||
|
|
||||||
enum transparent_hugepage_flag {
|
enum transparent_hugepage_flag {
|
||||||
TRANSPARENT_HUGEPAGE_FLAG,
|
TRANSPARENT_HUGEPAGE_FLAG,
|
||||||
|
@ -122,7 +124,7 @@ extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
|
||||||
#endif
|
#endif
|
||||||
extern int hugepage_madvise(struct vm_area_struct *vma,
|
extern int hugepage_madvise(struct vm_area_struct *vma,
|
||||||
unsigned long *vm_flags, int advice);
|
unsigned long *vm_flags, int advice);
|
||||||
extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
|
extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||||
unsigned long start,
|
unsigned long start,
|
||||||
unsigned long end,
|
unsigned long end,
|
||||||
long adjust_next);
|
long adjust_next);
|
||||||
|
@ -138,15 +140,6 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
|
||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
|
||||||
unsigned long start,
|
|
||||||
unsigned long end,
|
|
||||||
long adjust_next)
|
|
||||||
{
|
|
||||||
if (!vma->anon_vma || vma->vm_ops)
|
|
||||||
return;
|
|
||||||
__vma_adjust_trans_huge(vma, start, end, adjust_next);
|
|
||||||
}
|
|
||||||
static inline int hpage_nr_pages(struct page *page)
|
static inline int hpage_nr_pages(struct page *page)
|
||||||
{
|
{
|
||||||
if (unlikely(PageTransHuge(page)))
|
if (unlikely(PageTransHuge(page)))
|
||||||
|
@ -164,6 +157,13 @@ static inline bool is_huge_zero_page(struct page *page)
|
||||||
return ACCESS_ONCE(huge_zero_page) == page;
|
return ACCESS_ONCE(huge_zero_page) == page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool is_huge_zero_pmd(pmd_t pmd)
|
||||||
|
{
|
||||||
|
return is_huge_zero_page(pmd_page(pmd));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct page *get_huge_zero_page(void);
|
||||||
|
|
||||||
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||||
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
|
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
|
||||||
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
|
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
|
||||||
|
|
|
@ -35,6 +35,9 @@ struct resv_map {
|
||||||
struct kref refs;
|
struct kref refs;
|
||||||
spinlock_t lock;
|
spinlock_t lock;
|
||||||
struct list_head regions;
|
struct list_head regions;
|
||||||
|
long adds_in_progress;
|
||||||
|
struct list_head region_cache;
|
||||||
|
long region_cache_count;
|
||||||
};
|
};
|
||||||
extern struct resv_map *resv_map_alloc(void);
|
extern struct resv_map *resv_map_alloc(void);
|
||||||
void resv_map_release(struct kref *ref);
|
void resv_map_release(struct kref *ref);
|
||||||
|
@ -80,11 +83,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
||||||
struct vm_area_struct *vma,
|
struct vm_area_struct *vma,
|
||||||
vm_flags_t vm_flags);
|
vm_flags_t vm_flags);
|
||||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
|
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
|
||||||
|
long freed);
|
||||||
int dequeue_hwpoisoned_huge_page(struct page *page);
|
int dequeue_hwpoisoned_huge_page(struct page *page);
|
||||||
bool isolate_huge_page(struct page *page, struct list_head *list);
|
bool isolate_huge_page(struct page *page, struct list_head *list);
|
||||||
void putback_active_hugepage(struct page *page);
|
void putback_active_hugepage(struct page *page);
|
||||||
void free_huge_page(struct page *page);
|
void free_huge_page(struct page *page);
|
||||||
|
void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
|
||||||
|
extern struct mutex *hugetlb_fault_mutex_table;
|
||||||
|
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||||
|
struct vm_area_struct *vma,
|
||||||
|
struct address_space *mapping,
|
||||||
|
pgoff_t idx, unsigned long address);
|
||||||
|
|
||||||
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
|
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
|
||||||
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
|
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
|
||||||
|
@ -320,9 +330,13 @@ struct huge_bootmem_page {
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||||
|
unsigned long addr, int avoid_reserve);
|
||||||
struct page *alloc_huge_page_node(struct hstate *h, int nid);
|
struct page *alloc_huge_page_node(struct hstate *h, int nid);
|
||||||
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
|
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
|
||||||
unsigned long addr, int avoid_reserve);
|
unsigned long addr, int avoid_reserve);
|
||||||
|
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
|
||||||
|
pgoff_t idx);
|
||||||
|
|
||||||
/* arch callback */
|
/* arch callback */
|
||||||
int __init alloc_bootmem_huge_page(struct hstate *h);
|
int __init alloc_bootmem_huge_page(struct hstate *h);
|
||||||
|
@ -471,6 +485,7 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
||||||
|
|
||||||
#else /* CONFIG_HUGETLB_PAGE */
|
#else /* CONFIG_HUGETLB_PAGE */
|
||||||
struct hstate {};
|
struct hstate {};
|
||||||
|
#define alloc_huge_page(v, a, r) NULL
|
||||||
#define alloc_huge_page_node(h, nid) NULL
|
#define alloc_huge_page_node(h, nid) NULL
|
||||||
#define alloc_huge_page_noerr(v, a, r) NULL
|
#define alloc_huge_page_noerr(v, a, r) NULL
|
||||||
#define alloc_bootmem_huge_page(h) NULL
|
#define alloc_bootmem_huge_page(h) NULL
|
||||||
|
|
|
@ -77,6 +77,8 @@ int memblock_remove(phys_addr_t base, phys_addr_t size);
|
||||||
int memblock_free(phys_addr_t base, phys_addr_t size);
|
int memblock_free(phys_addr_t base, phys_addr_t size);
|
||||||
int memblock_reserve(phys_addr_t base, phys_addr_t size);
|
int memblock_reserve(phys_addr_t base, phys_addr_t size);
|
||||||
void memblock_trim_memory(phys_addr_t align);
|
void memblock_trim_memory(phys_addr_t align);
|
||||||
|
bool memblock_overlaps_region(struct memblock_type *type,
|
||||||
|
phys_addr_t base, phys_addr_t size);
|
||||||
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
|
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
|
||||||
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
|
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
|
||||||
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
|
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
|
||||||
|
@ -323,7 +325,7 @@ void memblock_enforce_memory_limit(phys_addr_t memory_limit);
|
||||||
int memblock_is_memory(phys_addr_t addr);
|
int memblock_is_memory(phys_addr_t addr);
|
||||||
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
|
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
|
||||||
int memblock_is_reserved(phys_addr_t addr);
|
int memblock_is_reserved(phys_addr_t addr);
|
||||||
int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
|
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
|
||||||
|
|
||||||
extern void __memblock_dump_all(void);
|
extern void __memblock_dump_all(void);
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,11 @@
|
||||||
#include <linux/vm_event_item.h>
|
#include <linux/vm_event_item.h>
|
||||||
#include <linux/hardirq.h>
|
#include <linux/hardirq.h>
|
||||||
#include <linux/jump_label.h>
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/page_counter.h>
|
||||||
|
#include <linux/vmpressure.h>
|
||||||
|
#include <linux/eventfd.h>
|
||||||
|
#include <linux/mmzone.h>
|
||||||
|
#include <linux/writeback.h>
|
||||||
|
|
||||||
struct mem_cgroup;
|
struct mem_cgroup;
|
||||||
struct page;
|
struct page;
|
||||||
|
@ -67,12 +72,221 @@ enum mem_cgroup_events_index {
|
||||||
MEMCG_NR_EVENTS,
|
MEMCG_NR_EVENTS,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Per memcg event counter is incremented at every pagein/pageout. With THP,
|
||||||
|
* it will be incremented by the number of pages. This counter is used for
|
||||||
|
* triggering some periodic events. This is straightforward and better
|
||||||
|
* than using jiffies etc. to handle periodic memcg events.
|
||||||
|
*/
|
||||||
|
enum mem_cgroup_events_target {
|
||||||
|
MEM_CGROUP_TARGET_THRESH,
|
||||||
|
MEM_CGROUP_TARGET_SOFTLIMIT,
|
||||||
|
MEM_CGROUP_TARGET_NUMAINFO,
|
||||||
|
MEM_CGROUP_NTARGETS,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Bits in struct cg_proto.flags
|
||||||
|
*/
|
||||||
|
enum cg_proto_flags {
|
||||||
|
/* Currently active and new sockets should be assigned to cgroups */
|
||||||
|
MEMCG_SOCK_ACTIVE,
|
||||||
|
/* It was ever activated; we must disarm static keys on destruction */
|
||||||
|
MEMCG_SOCK_ACTIVATED,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct cg_proto {
|
||||||
|
struct page_counter memory_allocated; /* Current allocated memory. */
|
||||||
|
struct percpu_counter sockets_allocated; /* Current number of sockets. */
|
||||||
|
int memory_pressure;
|
||||||
|
long sysctl_mem[3];
|
||||||
|
unsigned long flags;
|
||||||
|
/*
|
||||||
|
* memcg field is used to find which memcg we belong to directly.
|
||||||
|
* Each memcg struct can hold more than one cg_proto, so container_of
|
||||||
|
* won't really cut it.
|
||||||
|
*
|
||||||
|
* The elegant solution would be having an inverse function to
|
||||||
|
* proto_cgroup in struct proto, but that means polluting the structure
|
||||||
|
* for everybody, instead of just for memcg users.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup *memcg;
|
||||||
|
};
|
||||||
|
|
||||||
#ifdef CONFIG_MEMCG
|
#ifdef CONFIG_MEMCG
|
||||||
|
struct mem_cgroup_stat_cpu {
|
||||||
|
long count[MEM_CGROUP_STAT_NSTATS];
|
||||||
|
unsigned long events[MEMCG_NR_EVENTS];
|
||||||
|
unsigned long nr_page_events;
|
||||||
|
unsigned long targets[MEM_CGROUP_NTARGETS];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mem_cgroup_reclaim_iter {
|
||||||
|
struct mem_cgroup *position;
|
||||||
|
/* scan generation, increased every round-trip */
|
||||||
|
unsigned int generation;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* per-zone information in memory controller.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup_per_zone {
|
||||||
|
struct lruvec lruvec;
|
||||||
|
unsigned long lru_size[NR_LRU_LISTS];
|
||||||
|
|
||||||
|
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
|
||||||
|
|
||||||
|
struct rb_node tree_node; /* RB tree node */
|
||||||
|
unsigned long usage_in_excess;/* Set to the value by which */
|
||||||
|
/* the soft limit is exceeded*/
|
||||||
|
bool on_tree;
|
||||||
|
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
||||||
|
/* use container_of */
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mem_cgroup_per_node {
|
||||||
|
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mem_cgroup_threshold {
|
||||||
|
struct eventfd_ctx *eventfd;
|
||||||
|
unsigned long threshold;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* For threshold */
|
||||||
|
struct mem_cgroup_threshold_ary {
|
||||||
|
/* An array index points to threshold just below or equal to usage. */
|
||||||
|
int current_threshold;
|
||||||
|
/* Size of entries[] */
|
||||||
|
unsigned int size;
|
||||||
|
/* Array of thresholds */
|
||||||
|
struct mem_cgroup_threshold entries[0];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mem_cgroup_thresholds {
|
||||||
|
/* Primary thresholds array */
|
||||||
|
struct mem_cgroup_threshold_ary *primary;
|
||||||
|
/*
|
||||||
|
* Spare threshold array.
|
||||||
|
* This is needed to make mem_cgroup_unregister_event() "never fail".
|
||||||
|
* It must be able to store at least primary->size - 1 entries.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup_threshold_ary *spare;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The memory controller data structure. The memory controller controls both
|
||||||
|
* page cache and RSS per cgroup. We would eventually like to provide
|
||||||
|
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
|
||||||
|
* to help the administrator determine what knobs to tune.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup {
|
||||||
|
struct cgroup_subsys_state css;
|
||||||
|
|
||||||
|
/* Accounted resources */
|
||||||
|
struct page_counter memory;
|
||||||
|
struct page_counter memsw;
|
||||||
|
struct page_counter kmem;
|
||||||
|
|
||||||
|
/* Normal memory consumption range */
|
||||||
|
unsigned long low;
|
||||||
|
unsigned long high;
|
||||||
|
|
||||||
|
unsigned long soft_limit;
|
||||||
|
|
||||||
|
/* vmpressure notifications */
|
||||||
|
struct vmpressure vmpressure;
|
||||||
|
|
||||||
|
/* css_online() has been completed */
|
||||||
|
int initialized;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Should the accounting and control be hierarchical, per subtree?
|
||||||
|
*/
|
||||||
|
bool use_hierarchy;
|
||||||
|
|
||||||
|
/* protected by memcg_oom_lock */
|
||||||
|
bool oom_lock;
|
||||||
|
int under_oom;
|
||||||
|
|
||||||
|
int swappiness;
|
||||||
|
/* OOM-Killer disable */
|
||||||
|
int oom_kill_disable;
|
||||||
|
|
||||||
|
/* protect arrays of thresholds */
|
||||||
|
struct mutex thresholds_lock;
|
||||||
|
|
||||||
|
/* thresholds for memory usage. RCU-protected */
|
||||||
|
struct mem_cgroup_thresholds thresholds;
|
||||||
|
|
||||||
|
/* thresholds for mem+swap usage. RCU-protected */
|
||||||
|
struct mem_cgroup_thresholds memsw_thresholds;
|
||||||
|
|
||||||
|
/* For oom notifier event fd */
|
||||||
|
struct list_head oom_notify;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Should we move charges of a task when a task is moved into this
|
||||||
|
* mem_cgroup ? And what type of charges should we move ?
|
||||||
|
*/
|
||||||
|
unsigned long move_charge_at_immigrate;
|
||||||
|
/*
|
||||||
|
* set > 0 if pages under this cgroup are moving to another cgroup.
|
||||||
|
*/
|
||||||
|
atomic_t moving_account;
|
||||||
|
/* taken only while moving_account > 0 */
|
||||||
|
spinlock_t move_lock;
|
||||||
|
struct task_struct *move_lock_task;
|
||||||
|
unsigned long move_lock_flags;
|
||||||
|
/*
|
||||||
|
* percpu counter.
|
||||||
|
*/
|
||||||
|
struct mem_cgroup_stat_cpu __percpu *stat;
|
||||||
|
spinlock_t pcp_counter_lock;
|
||||||
|
|
||||||
|
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
|
||||||
|
struct cg_proto tcp_mem;
|
||||||
|
#endif
|
||||||
|
#if defined(CONFIG_MEMCG_KMEM)
|
||||||
|
/* Index in the kmem_cache->memcg_params.memcg_caches array */
|
||||||
|
int kmemcg_id;
|
||||||
|
bool kmem_acct_activated;
|
||||||
|
bool kmem_acct_active;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int last_scanned_node;
|
||||||
|
#if MAX_NUMNODES > 1
|
||||||
|
nodemask_t scan_nodes;
|
||||||
|
atomic_t numainfo_events;
|
||||||
|
atomic_t numainfo_updating;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||||
|
struct list_head cgwb_list;
|
||||||
|
struct wb_domain cgwb_domain;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* List of events which userspace wants to receive */
|
||||||
|
struct list_head event_list;
|
||||||
|
spinlock_t event_list_lock;
|
||||||
|
|
||||||
|
struct mem_cgroup_per_node *nodeinfo[0];
|
||||||
|
/* WARNING: nodeinfo must be the last member here */
|
||||||
|
};
|
||||||
extern struct cgroup_subsys_state *mem_cgroup_root_css;
|
extern struct cgroup_subsys_state *mem_cgroup_root_css;
|
||||||
|
|
||||||
void mem_cgroup_events(struct mem_cgroup *memcg,
|
/**
|
||||||
|
* mem_cgroup_events - count memory events against a cgroup
|
||||||
|
* @memcg: the memory cgroup
|
||||||
|
* @idx: the event index
|
||||||
|
* @nr: the number of events to account for
|
||||||
|
*/
|
||||||
|
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
|
||||||
enum mem_cgroup_events_index idx,
|
enum mem_cgroup_events_index idx,
|
||||||
unsigned int nr);
|
unsigned int nr)
|
||||||
|
{
|
||||||
|
this_cpu_add(memcg->stat->events[idx], nr);
|
||||||
|
}
|
||||||
|
|
||||||
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
|
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
|
||||||
|
|
||||||
|
@ -90,15 +304,31 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
|
||||||
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
|
||||||
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
|
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
|
||||||
|
|
||||||
bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
|
||||||
struct mem_cgroup *root);
|
|
||||||
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
|
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
|
||||||
|
|
||||||
extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
|
||||||
extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
||||||
|
|
||||||
extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
|
||||||
extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
|
static inline
|
||||||
|
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
|
||||||
|
return css ? container_of(css, struct mem_cgroup, css) : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
||||||
|
struct mem_cgroup *,
|
||||||
|
struct mem_cgroup_reclaim_cookie *);
|
||||||
|
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
||||||
|
|
||||||
|
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
|
||||||
|
struct mem_cgroup *root)
|
||||||
|
{
|
||||||
|
if (root == memcg)
|
||||||
|
return true;
|
||||||
|
if (!root->use_hierarchy)
|
||||||
|
return false;
|
||||||
|
return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
|
||||||
|
}
|
||||||
|
|
||||||
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
static inline bool mm_match_cgroup(struct mm_struct *mm,
|
||||||
struct mem_cgroup *memcg)
|
struct mem_cgroup *memcg)
|
||||||
|
@ -114,24 +344,67 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
|
||||||
return match;
|
return match;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
|
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
|
||||||
extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
|
|
||||||
|
|
||||||
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
|
static inline bool mem_cgroup_disabled(void)
|
||||||
struct mem_cgroup *,
|
{
|
||||||
struct mem_cgroup_reclaim_cookie *);
|
if (memory_cgrp_subsys.disabled)
|
||||||
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For memory reclaim.
|
* For memory reclaim.
|
||||||
*/
|
*/
|
||||||
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
|
|
||||||
bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
|
|
||||||
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
||||||
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
|
|
||||||
void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
|
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
||||||
extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
int nr_pages);
|
||||||
struct task_struct *p);
|
|
||||||
|
static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_per_zone *mz;
|
||||||
|
struct mem_cgroup *memcg;
|
||||||
|
|
||||||
|
if (mem_cgroup_disabled())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
||||||
|
memcg = mz->memcg;
|
||||||
|
|
||||||
|
return !!(memcg->css.flags & CSS_ONLINE);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline
|
||||||
|
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
||||||
|
{
|
||||||
|
struct mem_cgroup_per_zone *mz;
|
||||||
|
|
||||||
|
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
||||||
|
return mz->lru_size[lru];
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
||||||
|
{
|
||||||
|
unsigned long inactive_ratio;
|
||||||
|
unsigned long inactive;
|
||||||
|
unsigned long active;
|
||||||
|
unsigned long gb;
|
||||||
|
|
||||||
|
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
|
||||||
|
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
|
||||||
|
|
||||||
|
gb = (inactive + active) >> (30 - PAGE_SHIFT);
|
||||||
|
if (gb)
|
||||||
|
inactive_ratio = int_sqrt(10 * gb);
|
||||||
|
else
|
||||||
|
inactive_ratio = 1;
|
||||||
|
|
||||||
|
return inactive * inactive_ratio < active;
|
||||||
|
}
|
||||||
|
|
||||||
|
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
|
||||||
|
struct task_struct *p);
|
||||||
|
|
||||||
static inline void mem_cgroup_oom_enable(void)
|
static inline void mem_cgroup_oom_enable(void)
|
||||||
{
|
{
|
||||||
|
@ -156,18 +429,26 @@ bool mem_cgroup_oom_synchronize(bool wait);
|
||||||
extern int do_swap_account;
|
extern int do_swap_account;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static inline bool mem_cgroup_disabled(void)
|
|
||||||
{
|
|
||||||
if (memory_cgrp_subsys.disabled)
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
|
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
|
||||||
void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
|
||||||
enum mem_cgroup_stat_index idx, int val);
|
|
||||||
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
|
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* mem_cgroup_update_page_stat - update page state statistics
|
||||||
|
* @memcg: memcg to account against
|
||||||
|
* @idx: page state item to account
|
||||||
|
* @val: number of pages (positive or negative)
|
||||||
|
*
|
||||||
|
* See mem_cgroup_begin_page_stat() for locking requirements.
|
||||||
|
*/
|
||||||
|
static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
||||||
|
enum mem_cgroup_stat_index idx, int val)
|
||||||
|
{
|
||||||
|
VM_BUG_ON(!rcu_read_lock_held());
|
||||||
|
|
||||||
|
if (memcg)
|
||||||
|
this_cpu_add(memcg->stat->count[idx], val);
|
||||||
|
}
|
||||||
|
|
||||||
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
|
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
|
||||||
enum mem_cgroup_stat_index idx)
|
enum mem_cgroup_stat_index idx)
|
||||||
{
|
{
|
||||||
|
@ -184,13 +465,31 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
||||||
gfp_t gfp_mask,
|
gfp_t gfp_mask,
|
||||||
unsigned long *total_scanned);
|
unsigned long *total_scanned);
|
||||||
|
|
||||||
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
|
|
||||||
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
|
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
|
||||||
enum vm_event_item idx)
|
enum vm_event_item idx)
|
||||||
{
|
{
|
||||||
|
struct mem_cgroup *memcg;
|
||||||
|
|
||||||
if (mem_cgroup_disabled())
|
if (mem_cgroup_disabled())
|
||||||
return;
|
return;
|
||||||
__mem_cgroup_count_vm_event(mm, idx);
|
|
||||||
|
rcu_read_lock();
|
||||||
|
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
||||||
|
if (unlikely(!memcg))
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
switch (idx) {
|
||||||
|
case PGFAULT:
|
||||||
|
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
|
||||||
|
break;
|
||||||
|
case PGMAJFAULT:
|
||||||
|
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
out:
|
||||||
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||||
void mem_cgroup_split_huge_fixup(struct page *head);
|
void mem_cgroup_split_huge_fixup(struct page *head);
|
||||||
|
@ -199,8 +498,6 @@ void mem_cgroup_split_huge_fixup(struct page *head);
|
||||||
#else /* CONFIG_MEMCG */
|
#else /* CONFIG_MEMCG */
|
||||||
struct mem_cgroup;
|
struct mem_cgroup;
|
||||||
|
|
||||||
#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
|
|
||||||
|
|
||||||
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
|
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
|
||||||
enum mem_cgroup_events_index idx,
|
enum mem_cgroup_events_index idx,
|
||||||
unsigned int nr)
|
unsigned int nr)
|
||||||
|
@ -275,12 +572,6 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct cgroup_subsys_state
|
|
||||||
*mem_cgroup_css(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline struct mem_cgroup *
|
static inline struct mem_cgroup *
|
||||||
mem_cgroup_iter(struct mem_cgroup *root,
|
mem_cgroup_iter(struct mem_cgroup *root,
|
||||||
struct mem_cgroup *prev,
|
struct mem_cgroup *prev,
|
||||||
|
@ -428,8 +719,8 @@ static inline void sock_release_memcg(struct sock *sk)
|
||||||
extern struct static_key memcg_kmem_enabled_key;
|
extern struct static_key memcg_kmem_enabled_key;
|
||||||
|
|
||||||
extern int memcg_nr_cache_ids;
|
extern int memcg_nr_cache_ids;
|
||||||
extern void memcg_get_cache_ids(void);
|
void memcg_get_cache_ids(void);
|
||||||
extern void memcg_put_cache_ids(void);
|
void memcg_put_cache_ids(void);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Helper macro to loop through all memcg-specific caches. Callers must still
|
* Helper macro to loop through all memcg-specific caches. Callers must still
|
||||||
|
@ -444,7 +735,10 @@ static inline bool memcg_kmem_enabled(void)
|
||||||
return static_key_false(&memcg_kmem_enabled_key);
|
return static_key_false(&memcg_kmem_enabled_key);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool memcg_kmem_is_active(struct mem_cgroup *memcg);
|
static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
return memcg->kmem_acct_active;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In general, we'll do everything in our power to not incur in any overhead
|
* In general, we'll do everything in our power to not incur in any overhead
|
||||||
|
@ -463,7 +757,15 @@ void __memcg_kmem_commit_charge(struct page *page,
|
||||||
struct mem_cgroup *memcg, int order);
|
struct mem_cgroup *memcg, int order);
|
||||||
void __memcg_kmem_uncharge_pages(struct page *page, int order);
|
void __memcg_kmem_uncharge_pages(struct page *page, int order);
|
||||||
|
|
||||||
int memcg_cache_id(struct mem_cgroup *memcg);
|
/*
|
||||||
|
* helper for acessing a memcg's index. It will be used as an index in the
|
||||||
|
* child cache array in kmem_cache, and also to derive its name. This function
|
||||||
|
* will return -1 when this is not a kmem-limited memcg.
|
||||||
|
*/
|
||||||
|
static inline int memcg_cache_id(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
return memcg ? memcg->kmemcg_id : -1;
|
||||||
|
}
|
||||||
|
|
||||||
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
|
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
|
||||||
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
|
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
|
||||||
|
|
|
@ -249,6 +249,8 @@ struct vm_operations_struct {
|
||||||
void (*close)(struct vm_area_struct * area);
|
void (*close)(struct vm_area_struct * area);
|
||||||
int (*mremap)(struct vm_area_struct * area);
|
int (*mremap)(struct vm_area_struct * area);
|
||||||
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
|
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
|
||||||
|
int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
|
||||||
|
pmd_t *, unsigned int flags);
|
||||||
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
|
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
|
||||||
|
|
||||||
/* notification that a previously read-only page is about to become
|
/* notification that a previously read-only page is about to become
|
||||||
|
@ -307,18 +309,6 @@ struct inode;
|
||||||
#define page_private(page) ((page)->private)
|
#define page_private(page) ((page)->private)
|
||||||
#define set_page_private(page, v) ((page)->private = (v))
|
#define set_page_private(page, v) ((page)->private = (v))
|
||||||
|
|
||||||
/* It's valid only if the page is free path or free_list */
|
|
||||||
static inline void set_freepage_migratetype(struct page *page, int migratetype)
|
|
||||||
{
|
|
||||||
page->index = migratetype;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* It's valid only if the page is free path or free_list */
|
|
||||||
static inline int get_freepage_migratetype(struct page *page)
|
|
||||||
{
|
|
||||||
return page->index;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* FIXME: take this include out, include page-flags.h in
|
* FIXME: take this include out, include page-flags.h in
|
||||||
* files which need it (119 of them)
|
* files which need it (119 of them)
|
||||||
|
@ -359,18 +349,6 @@ static inline int get_page_unless_zero(struct page *page)
|
||||||
return atomic_inc_not_zero(&page->_count);
|
return atomic_inc_not_zero(&page->_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Try to drop a ref unless the page has a refcount of one, return false if
|
|
||||||
* that is the case.
|
|
||||||
* This is to make sure that the refcount won't become zero after this drop.
|
|
||||||
* This can be called when MMU is off so it must not access
|
|
||||||
* any of the virtual mappings.
|
|
||||||
*/
|
|
||||||
static inline int put_page_unless_one(struct page *page)
|
|
||||||
{
|
|
||||||
return atomic_add_unless(&page->_count, -1, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
extern int page_is_ram(unsigned long pfn);
|
extern int page_is_ram(unsigned long pfn);
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
|
@ -1267,6 +1245,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
|
||||||
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
|
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
|
||||||
|
{
|
||||||
|
return !vma->vm_ops;
|
||||||
|
}
|
||||||
|
|
||||||
static inline int stack_guard_page_start(struct vm_area_struct *vma,
|
static inline int stack_guard_page_start(struct vm_area_struct *vma,
|
||||||
unsigned long addr)
|
unsigned long addr)
|
||||||
{
|
{
|
||||||
|
@ -2193,6 +2176,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags);
|
||||||
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
|
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
|
||||||
extern int unpoison_memory(unsigned long pfn);
|
extern int unpoison_memory(unsigned long pfn);
|
||||||
extern int get_hwpoison_page(struct page *page);
|
extern int get_hwpoison_page(struct page *page);
|
||||||
|
extern void put_hwpoison_page(struct page *page);
|
||||||
extern int sysctl_memory_failure_early_kill;
|
extern int sysctl_memory_failure_early_kill;
|
||||||
extern int sysctl_memory_failure_recovery;
|
extern int sysctl_memory_failure_recovery;
|
||||||
extern void shake_page(struct page *p, int access);
|
extern void shake_page(struct page *p, int access);
|
||||||
|
|
|
@ -235,7 +235,7 @@ struct page_frag_cache {
|
||||||
bool pfmemalloc;
|
bool pfmemalloc;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef unsigned long __nocast vm_flags_t;
|
typedef unsigned long vm_flags_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* A region containing a mapping of a non-memory backed file under NOMMU
|
* A region containing a mapping of a non-memory backed file under NOMMU
|
||||||
|
|
|
@ -12,6 +12,27 @@ struct notifier_block;
|
||||||
struct mem_cgroup;
|
struct mem_cgroup;
|
||||||
struct task_struct;
|
struct task_struct;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Details of the page allocation that triggered the oom killer that are used to
|
||||||
|
* determine what should be killed.
|
||||||
|
*/
|
||||||
|
struct oom_control {
|
||||||
|
/* Used to determine cpuset */
|
||||||
|
struct zonelist *zonelist;
|
||||||
|
|
||||||
|
/* Used to determine mempolicy */
|
||||||
|
nodemask_t *nodemask;
|
||||||
|
|
||||||
|
/* Used to determine cpuset and node locality requirement */
|
||||||
|
const gfp_t gfp_mask;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* order == -1 means the oom kill is required by sysrq, otherwise only
|
||||||
|
* for display purposes.
|
||||||
|
*/
|
||||||
|
const int order;
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Types of limitations to the nodes from which allocations may occur
|
* Types of limitations to the nodes from which allocations may occur
|
||||||
*/
|
*/
|
||||||
|
@ -57,21 +78,18 @@ extern unsigned long oom_badness(struct task_struct *p,
|
||||||
|
|
||||||
extern int oom_kills_count(void);
|
extern int oom_kills_count(void);
|
||||||
extern void note_oom_kill(void);
|
extern void note_oom_kill(void);
|
||||||
extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||||
unsigned int points, unsigned long totalpages,
|
unsigned int points, unsigned long totalpages,
|
||||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
struct mem_cgroup *memcg, const char *message);
|
||||||
const char *message);
|
|
||||||
|
|
||||||
extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
extern void check_panic_on_oom(struct oom_control *oc,
|
||||||
int order, const nodemask_t *nodemask,
|
enum oom_constraint constraint,
|
||||||
struct mem_cgroup *memcg);
|
struct mem_cgroup *memcg);
|
||||||
|
|
||||||
extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
|
||||||
unsigned long totalpages, const nodemask_t *nodemask,
|
struct task_struct *task, unsigned long totalpages);
|
||||||
bool force_kill);
|
|
||||||
|
|
||||||
extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
extern bool out_of_memory(struct oom_control *oc);
|
||||||
int order, nodemask_t *mask, bool force_kill);
|
|
||||||
|
|
||||||
extern void exit_oom_victim(void);
|
extern void exit_oom_victim(void);
|
||||||
|
|
||||||
|
|
|
@ -65,11 +65,6 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||||
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
||||||
bool skip_hwpoisoned_pages);
|
bool skip_hwpoisoned_pages);
|
||||||
|
|
||||||
/*
|
|
||||||
* Internal functions. Changes pageblock's migrate type.
|
|
||||||
*/
|
|
||||||
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
|
|
||||||
void unset_migratetype_isolate(struct page *page, unsigned migratetype);
|
|
||||||
struct page *alloc_migrate_target(struct page *page, unsigned long private,
|
struct page *alloc_migrate_target(struct page *page, unsigned long private,
|
||||||
int **resultp);
|
int **resultp);
|
||||||
|
|
||||||
|
|
|
@ -1227,6 +1227,8 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
|
||||||
dma_pool_create(name, &pdev->dev, size, align, allocation)
|
dma_pool_create(name, &pdev->dev, size, align, allocation)
|
||||||
#define pci_pool_destroy(pool) dma_pool_destroy(pool)
|
#define pci_pool_destroy(pool) dma_pool_destroy(pool)
|
||||||
#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
|
#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
|
||||||
|
#define pci_pool_zalloc(pool, flags, handle) \
|
||||||
|
dma_pool_zalloc(pool, flags, handle)
|
||||||
#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
|
#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
|
||||||
|
|
||||||
struct msix_entry {
|
struct msix_entry {
|
||||||
|
|
|
@ -351,7 +351,15 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);
|
||||||
extern int kswapd_run(int nid);
|
extern int kswapd_run(int nid);
|
||||||
extern void kswapd_stop(int nid);
|
extern void kswapd_stop(int nid);
|
||||||
#ifdef CONFIG_MEMCG
|
#ifdef CONFIG_MEMCG
|
||||||
extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
|
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
|
||||||
|
{
|
||||||
|
/* root ? */
|
||||||
|
if (mem_cgroup_disabled() || !memcg->css.parent)
|
||||||
|
return vm_swappiness;
|
||||||
|
|
||||||
|
return memcg->swappiness;
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
|
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
|
||||||
{
|
{
|
||||||
|
@ -398,6 +406,9 @@ extern void free_pages_and_swap_cache(struct page **, int);
|
||||||
extern struct page *lookup_swap_cache(swp_entry_t);
|
extern struct page *lookup_swap_cache(swp_entry_t);
|
||||||
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
|
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
|
||||||
struct vm_area_struct *vma, unsigned long addr);
|
struct vm_area_struct *vma, unsigned long addr);
|
||||||
|
extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
|
||||||
|
struct vm_area_struct *vma, unsigned long addr,
|
||||||
|
bool *new_page_allocated);
|
||||||
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
||||||
struct vm_area_struct *vma, unsigned long addr);
|
struct vm_area_struct *vma, unsigned long addr);
|
||||||
|
|
||||||
|
@ -431,6 +442,7 @@ extern unsigned int count_swap_pages(int, int);
|
||||||
extern sector_t map_swap_page(struct page *, struct block_device **);
|
extern sector_t map_swap_page(struct page *, struct block_device **);
|
||||||
extern sector_t swapdev_block(int, pgoff_t);
|
extern sector_t swapdev_block(int, pgoff_t);
|
||||||
extern int page_swapcount(struct page *);
|
extern int page_swapcount(struct page *);
|
||||||
|
extern int swp_swapcount(swp_entry_t entry);
|
||||||
extern struct swap_info_struct *page_swap_info(struct page *);
|
extern struct swap_info_struct *page_swap_info(struct page *);
|
||||||
extern int reuse_swap_page(struct page *);
|
extern int reuse_swap_page(struct page *);
|
||||||
extern int try_to_free_swap(struct page *);
|
extern int try_to_free_swap(struct page *);
|
||||||
|
@ -522,6 +534,11 @@ static inline int page_swapcount(struct page *page)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int swp_swapcount(swp_entry_t entry)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#define reuse_swap_page(page) (page_mapcount(page) == 1)
|
#define reuse_swap_page(page) (page_mapcount(page) == 1)
|
||||||
|
|
||||||
static inline int try_to_free_swap(struct page *page)
|
static inline int try_to_free_swap(struct page *page)
|
||||||
|
|
|
@ -164,6 +164,9 @@ static inline int is_write_migration_entry(swp_entry_t entry)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_MEMORY_FAILURE
|
#ifdef CONFIG_MEMORY_FAILURE
|
||||||
|
|
||||||
|
extern atomic_long_t num_poisoned_pages __read_mostly;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Support for hardware poisoned pages
|
* Support for hardware poisoned pages
|
||||||
*/
|
*/
|
||||||
|
@ -177,6 +180,31 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
|
||||||
{
|
{
|
||||||
return swp_type(entry) == SWP_HWPOISON;
|
return swp_type(entry) == SWP_HWPOISON;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool test_set_page_hwpoison(struct page *page)
|
||||||
|
{
|
||||||
|
return TestSetPageHWPoison(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void num_poisoned_pages_inc(void)
|
||||||
|
{
|
||||||
|
atomic_long_inc(&num_poisoned_pages);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void num_poisoned_pages_dec(void)
|
||||||
|
{
|
||||||
|
atomic_long_dec(&num_poisoned_pages);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void num_poisoned_pages_add(long num)
|
||||||
|
{
|
||||||
|
atomic_long_add(num, &num_poisoned_pages);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void num_poisoned_pages_sub(long num)
|
||||||
|
{
|
||||||
|
atomic_long_sub(num, &num_poisoned_pages);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
static inline swp_entry_t make_hwpoison_entry(struct page *page)
|
static inline swp_entry_t make_hwpoison_entry(struct page *page)
|
||||||
|
@ -188,6 +216,15 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline bool test_set_page_hwpoison(struct page *page)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void num_poisoned_pages_inc(void)
|
||||||
|
{
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
|
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
|
||||||
|
|
|
@ -9,7 +9,7 @@ struct zbud_ops {
|
||||||
int (*evict)(struct zbud_pool *pool, unsigned long handle);
|
int (*evict)(struct zbud_pool *pool, unsigned long handle);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
|
struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
|
||||||
void zbud_destroy_pool(struct zbud_pool *pool);
|
void zbud_destroy_pool(struct zbud_pool *pool);
|
||||||
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
|
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
|
||||||
unsigned long *handle);
|
unsigned long *handle);
|
||||||
|
|
|
@ -37,7 +37,7 @@ enum zpool_mapmode {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct zpool *zpool_create_pool(char *type, char *name,
|
struct zpool *zpool_create_pool(char *type, char *name,
|
||||||
gfp_t gfp, struct zpool_ops *ops);
|
gfp_t gfp, const struct zpool_ops *ops);
|
||||||
|
|
||||||
char *zpool_get_type(struct zpool *pool);
|
char *zpool_get_type(struct zpool *pool);
|
||||||
|
|
||||||
|
@ -81,7 +81,7 @@ struct zpool_driver {
|
||||||
atomic_t refcount;
|
atomic_t refcount;
|
||||||
struct list_head list;
|
struct list_head list;
|
||||||
|
|
||||||
void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
|
void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
|
||||||
struct zpool *zpool);
|
struct zpool *zpool);
|
||||||
void (*destroy)(void *pool);
|
void (*destroy)(void *pool);
|
||||||
|
|
||||||
|
|
|
@ -34,6 +34,11 @@ enum zs_mapmode {
|
||||||
*/
|
*/
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct zs_pool_stats {
|
||||||
|
/* How many pages were migrated (freed) */
|
||||||
|
unsigned long pages_compacted;
|
||||||
|
};
|
||||||
|
|
||||||
struct zs_pool;
|
struct zs_pool;
|
||||||
|
|
||||||
struct zs_pool *zs_create_pool(char *name, gfp_t flags);
|
struct zs_pool *zs_create_pool(char *name, gfp_t flags);
|
||||||
|
@ -49,4 +54,5 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
|
||||||
unsigned long zs_get_total_pages(struct zs_pool *pool);
|
unsigned long zs_get_total_pages(struct zs_pool *pool);
|
||||||
unsigned long zs_compact(struct zs_pool *pool);
|
unsigned long zs_compact(struct zs_pool *pool);
|
||||||
|
|
||||||
|
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1042,42 +1042,9 @@ struct proto {
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
* Bits in struct cg_proto.flags
|
|
||||||
*/
|
|
||||||
enum cg_proto_flags {
|
|
||||||
/* Currently active and new sockets should be assigned to cgroups */
|
|
||||||
MEMCG_SOCK_ACTIVE,
|
|
||||||
/* It was ever activated; we must disarm static keys on destruction */
|
|
||||||
MEMCG_SOCK_ACTIVATED,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct cg_proto {
|
|
||||||
struct page_counter memory_allocated; /* Current allocated memory. */
|
|
||||||
struct percpu_counter sockets_allocated; /* Current number of sockets. */
|
|
||||||
int memory_pressure;
|
|
||||||
long sysctl_mem[3];
|
|
||||||
unsigned long flags;
|
|
||||||
/*
|
|
||||||
* memcg field is used to find which memcg we belong directly
|
|
||||||
* Each memcg struct can hold more than one cg_proto, so container_of
|
|
||||||
* won't really cut.
|
|
||||||
*
|
|
||||||
* The elegant solution would be having an inverse function to
|
|
||||||
* proto_cgroup in struct proto, but that means polluting the structure
|
|
||||||
* for everybody, instead of just for memcg users.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup *memcg;
|
|
||||||
};
|
|
||||||
|
|
||||||
int proto_register(struct proto *prot, int alloc_slab);
|
int proto_register(struct proto *prot, int alloc_slab);
|
||||||
void proto_unregister(struct proto *prot);
|
void proto_unregister(struct proto *prot);
|
||||||
|
|
||||||
static inline bool memcg_proto_active(struct cg_proto *cg_proto)
|
|
||||||
{
|
|
||||||
return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef SOCK_REFCNT_DEBUG
|
#ifdef SOCK_REFCNT_DEBUG
|
||||||
static inline void sk_refcnt_debug_inc(struct sock *sk)
|
static inline void sk_refcnt_debug_inc(struct sock *sk)
|
||||||
{
|
{
|
||||||
|
|
|
@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
|
||||||
if (root != &cgrp_dfl_root)
|
if (root != &cgrp_dfl_root)
|
||||||
for_each_subsys(ss, ssid)
|
for_each_subsys(ss, ssid)
|
||||||
if (root->subsys_mask & (1 << ssid))
|
if (root->subsys_mask & (1 << ssid))
|
||||||
seq_show_option(seq, ss->name, NULL);
|
seq_show_option(seq, ss->legacy_name, NULL);
|
||||||
if (root->flags & CGRP_ROOT_NOPREFIX)
|
if (root->flags & CGRP_ROOT_NOPREFIX)
|
||||||
seq_puts(seq, ",noprefix");
|
seq_puts(seq, ",noprefix");
|
||||||
if (root->flags & CGRP_ROOT_XATTR)
|
if (root->flags & CGRP_ROOT_XATTR)
|
||||||
|
|
|
@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
|
||||||
node = cpu_to_mem(cpu);
|
node = cpu_to_mem(cpu);
|
||||||
per_cpu(cpu_profile_flip, cpu) = 0;
|
per_cpu(cpu_profile_flip, cpu) = 0;
|
||||||
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
|
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
|
||||||
page = alloc_pages_exact_node(node,
|
page = __alloc_pages_node(node,
|
||||||
GFP_KERNEL | __GFP_ZERO,
|
GFP_KERNEL | __GFP_ZERO,
|
||||||
0);
|
0);
|
||||||
if (!page)
|
if (!page)
|
||||||
|
@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
|
||||||
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
|
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
|
||||||
}
|
}
|
||||||
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
|
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
|
||||||
page = alloc_pages_exact_node(node,
|
page = __alloc_pages_node(node,
|
||||||
GFP_KERNEL | __GFP_ZERO,
|
GFP_KERNEL | __GFP_ZERO,
|
||||||
0);
|
0);
|
||||||
if (!page)
|
if (!page)
|
||||||
|
@ -543,14 +543,14 @@ static int create_hash_tables(void)
|
||||||
int node = cpu_to_mem(cpu);
|
int node = cpu_to_mem(cpu);
|
||||||
struct page *page;
|
struct page *page;
|
||||||
|
|
||||||
page = alloc_pages_exact_node(node,
|
page = __alloc_pages_node(node,
|
||||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||||
0);
|
0);
|
||||||
if (!page)
|
if (!page)
|
||||||
goto out_cleanup;
|
goto out_cleanup;
|
||||||
per_cpu(cpu_profile_hits, cpu)[1]
|
per_cpu(cpu_profile_hits, cpu)[1]
|
||||||
= (struct profile_hit *)page_address(page);
|
= (struct profile_hit *)page_address(page);
|
||||||
page = alloc_pages_exact_node(node,
|
page = __alloc_pages_node(node,
|
||||||
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
|
||||||
0);
|
0);
|
||||||
if (!page)
|
if (!page)
|
||||||
|
|
|
@ -38,11 +38,9 @@ void show_mem(unsigned int filter)
|
||||||
|
|
||||||
printk("%lu pages RAM\n", total);
|
printk("%lu pages RAM\n", total);
|
||||||
printk("%lu pages HighMem/MovableOnly\n", highmem);
|
printk("%lu pages HighMem/MovableOnly\n", highmem);
|
||||||
#ifdef CONFIG_CMA
|
|
||||||
printk("%lu pages reserved\n", (reserved - totalcma_pages));
|
|
||||||
printk("%lu pages cma reserved\n", totalcma_pages);
|
|
||||||
#else
|
|
||||||
printk("%lu pages reserved\n", reserved);
|
printk("%lu pages reserved\n", reserved);
|
||||||
|
#ifdef CONFIG_CMA
|
||||||
|
printk("%lu pages cma reserved\n", totalcma_pages);
|
||||||
#endif
|
#endif
|
||||||
#ifdef CONFIG_QUICKLIST
|
#ifdef CONFIG_QUICKLIST
|
||||||
printk("%lu pages in pagetable cache\n",
|
printk("%lu pages in pagetable cache\n",
|
||||||
|
|
|
@ -236,6 +236,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
|
||||||
count += pages;
|
count += pages;
|
||||||
while (pages--)
|
while (pages--)
|
||||||
__free_pages_bootmem(page++, cur++, 0);
|
__free_pages_bootmem(page++, cur++, 0);
|
||||||
|
bdata->node_bootmem_map = NULL;
|
||||||
|
|
||||||
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
|
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
|
||||||
|
|
||||||
|
@ -294,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata,
|
||||||
sidx + bdata->node_min_pfn,
|
sidx + bdata->node_min_pfn,
|
||||||
eidx + bdata->node_min_pfn);
|
eidx + bdata->node_min_pfn);
|
||||||
|
|
||||||
|
if (WARN_ON(bdata->node_bootmem_map == NULL))
|
||||||
|
return;
|
||||||
|
|
||||||
if (bdata->hint_idx > sidx)
|
if (bdata->hint_idx > sidx)
|
||||||
bdata->hint_idx = sidx;
|
bdata->hint_idx = sidx;
|
||||||
|
|
||||||
|
@ -314,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
|
||||||
eidx + bdata->node_min_pfn,
|
eidx + bdata->node_min_pfn,
|
||||||
flags);
|
flags);
|
||||||
|
|
||||||
|
if (WARN_ON(bdata->node_bootmem_map == NULL))
|
||||||
|
return 0;
|
||||||
|
|
||||||
for (idx = sidx; idx < eidx; idx++)
|
for (idx = sidx; idx < eidx; idx++)
|
||||||
if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
|
if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
|
||||||
if (exclusive) {
|
if (exclusive) {
|
||||||
|
|
175
mm/compaction.c
175
mm/compaction.c
|
@ -207,6 +207,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
|
||||||
return !get_pageblock_skip(page);
|
return !get_pageblock_skip(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void reset_cached_positions(struct zone *zone)
|
||||||
|
{
|
||||||
|
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
|
||||||
|
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
|
||||||
|
zone->compact_cached_free_pfn = zone_end_pfn(zone);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function is called to clear all cached information on pageblocks that
|
* This function is called to clear all cached information on pageblocks that
|
||||||
* should be skipped for page isolation when the migrate and free page scanner
|
* should be skipped for page isolation when the migrate and free page scanner
|
||||||
|
@ -218,9 +225,6 @@ static void __reset_isolation_suitable(struct zone *zone)
|
||||||
unsigned long end_pfn = zone_end_pfn(zone);
|
unsigned long end_pfn = zone_end_pfn(zone);
|
||||||
unsigned long pfn;
|
unsigned long pfn;
|
||||||
|
|
||||||
zone->compact_cached_migrate_pfn[0] = start_pfn;
|
|
||||||
zone->compact_cached_migrate_pfn[1] = start_pfn;
|
|
||||||
zone->compact_cached_free_pfn = end_pfn;
|
|
||||||
zone->compact_blockskip_flush = false;
|
zone->compact_blockskip_flush = false;
|
||||||
|
|
||||||
/* Walk the zone and mark every pageblock as suitable for isolation */
|
/* Walk the zone and mark every pageblock as suitable for isolation */
|
||||||
|
@ -238,6 +242,8 @@ static void __reset_isolation_suitable(struct zone *zone)
|
||||||
|
|
||||||
clear_pageblock_skip(page);
|
clear_pageblock_skip(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
reset_cached_positions(zone);
|
||||||
}
|
}
|
||||||
|
|
||||||
void reset_isolation_suitable(pg_data_t *pgdat)
|
void reset_isolation_suitable(pg_data_t *pgdat)
|
||||||
|
@ -431,6 +437,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
|
||||||
|
|
||||||
if (!valid_page)
|
if (!valid_page)
|
||||||
valid_page = page;
|
valid_page = page;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For compound pages such as THP and hugetlbfs, we can save
|
||||||
|
* potentially a lot of iterations if we skip them at once.
|
||||||
|
* The check is racy, but we can consider only valid values
|
||||||
|
* and the only danger is skipping too much.
|
||||||
|
*/
|
||||||
|
if (PageCompound(page)) {
|
||||||
|
unsigned int comp_order = compound_order(page);
|
||||||
|
|
||||||
|
if (likely(comp_order < MAX_ORDER)) {
|
||||||
|
blockpfn += (1UL << comp_order) - 1;
|
||||||
|
cursor += (1UL << comp_order) - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
goto isolate_fail;
|
||||||
|
}
|
||||||
|
|
||||||
if (!PageBuddy(page))
|
if (!PageBuddy(page))
|
||||||
goto isolate_fail;
|
goto isolate_fail;
|
||||||
|
|
||||||
|
@ -490,6 +514,13 @@ isolate_fail:
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* There is a tiny chance that we have read bogus compound_order(),
|
||||||
|
* so be careful to not go outside of the pageblock.
|
||||||
|
*/
|
||||||
|
if (unlikely(blockpfn > end_pfn))
|
||||||
|
blockpfn = end_pfn;
|
||||||
|
|
||||||
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
|
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
|
||||||
nr_scanned, total_isolated);
|
nr_scanned, total_isolated);
|
||||||
|
|
||||||
|
@ -674,6 +705,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||||
|
|
||||||
/* Time to isolate some pages for migration */
|
/* Time to isolate some pages for migration */
|
||||||
for (; low_pfn < end_pfn; low_pfn++) {
|
for (; low_pfn < end_pfn; low_pfn++) {
|
||||||
|
bool is_lru;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Periodically drop the lock (if held) regardless of its
|
* Periodically drop the lock (if held) regardless of its
|
||||||
* contention, to give chance to IRQs. Abort async compaction
|
* contention, to give chance to IRQs. Abort async compaction
|
||||||
|
@ -717,36 +750,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||||
* It's possible to migrate LRU pages and balloon pages
|
* It's possible to migrate LRU pages and balloon pages
|
||||||
* Skip any other type of page
|
* Skip any other type of page
|
||||||
*/
|
*/
|
||||||
if (!PageLRU(page)) {
|
is_lru = PageLRU(page);
|
||||||
|
if (!is_lru) {
|
||||||
if (unlikely(balloon_page_movable(page))) {
|
if (unlikely(balloon_page_movable(page))) {
|
||||||
if (balloon_page_isolate(page)) {
|
if (balloon_page_isolate(page)) {
|
||||||
/* Successfully isolated */
|
/* Successfully isolated */
|
||||||
goto isolate_success;
|
goto isolate_success;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* PageLRU is set. lru_lock normally excludes isolation
|
* Regardless of being on LRU, compound pages such as THP and
|
||||||
* splitting and collapsing (collapsing has already happened
|
* hugetlbfs are not to be compacted. We can potentially save
|
||||||
* if PageLRU is set) but the lock is not necessarily taken
|
* a lot of iterations if we skip them at once. The check is
|
||||||
* here and it is wasteful to take it just to check transhuge.
|
* racy, but we can consider only valid values and the only
|
||||||
* Check TransHuge without lock and skip the whole pageblock if
|
* danger is skipping too much.
|
||||||
* it's either a transhuge or hugetlbfs page, as calling
|
|
||||||
* compound_order() without preventing THP from splitting the
|
|
||||||
* page underneath us may return surprising results.
|
|
||||||
*/
|
*/
|
||||||
if (PageTransHuge(page)) {
|
if (PageCompound(page)) {
|
||||||
if (!locked)
|
unsigned int comp_order = compound_order(page);
|
||||||
low_pfn = ALIGN(low_pfn + 1,
|
|
||||||
pageblock_nr_pages) - 1;
|
if (likely(comp_order < MAX_ORDER))
|
||||||
else
|
low_pfn += (1UL << comp_order) - 1;
|
||||||
low_pfn += (1 << compound_order(page)) - 1;
|
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!is_lru)
|
||||||
|
continue;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Migration will fail if an anonymous page is pinned in memory,
|
* Migration will fail if an anonymous page is pinned in memory,
|
||||||
* so avoid taking lru_lock and isolating it unnecessarily in an
|
* so avoid taking lru_lock and isolating it unnecessarily in an
|
||||||
|
@ -763,11 +795,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||||
if (!locked)
|
if (!locked)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Recheck PageLRU and PageTransHuge under lock */
|
/* Recheck PageLRU and PageCompound under lock */
|
||||||
if (!PageLRU(page))
|
if (!PageLRU(page))
|
||||||
continue;
|
continue;
|
||||||
if (PageTransHuge(page)) {
|
|
||||||
low_pfn += (1 << compound_order(page)) - 1;
|
/*
|
||||||
|
* Page become compound since the non-locked check,
|
||||||
|
* and it's on LRU. It can only be a THP so the order
|
||||||
|
* is safe to read and it's 0 for tail pages.
|
||||||
|
*/
|
||||||
|
if (unlikely(PageCompound(page))) {
|
||||||
|
low_pfn += (1UL << compound_order(page)) - 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -778,7 +816,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||||
if (__isolate_lru_page(page, isolate_mode) != 0)
|
if (__isolate_lru_page(page, isolate_mode) != 0)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
VM_BUG_ON_PAGE(PageTransCompound(page), page);
|
VM_BUG_ON_PAGE(PageCompound(page), page);
|
||||||
|
|
||||||
/* Successfully isolated */
|
/* Successfully isolated */
|
||||||
del_page_from_lru_list(page, lruvec, page_lru(page));
|
del_page_from_lru_list(page, lruvec, page_lru(page));
|
||||||
|
@ -897,6 +935,16 @@ static bool suitable_migration_target(struct page *page)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Test whether the free scanner has reached the same or lower pageblock than
|
||||||
|
* the migration scanner, and compaction should thus terminate.
|
||||||
|
*/
|
||||||
|
static inline bool compact_scanners_met(struct compact_control *cc)
|
||||||
|
{
|
||||||
|
return (cc->free_pfn >> pageblock_order)
|
||||||
|
<= (cc->migrate_pfn >> pageblock_order);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Based on information in the current compact_control, find blocks
|
* Based on information in the current compact_control, find blocks
|
||||||
* suitable for isolating free pages from and then isolate them.
|
* suitable for isolating free pages from and then isolate them.
|
||||||
|
@ -933,8 +981,7 @@ static void isolate_freepages(struct compact_control *cc)
|
||||||
* pages on cc->migratepages. We stop searching if the migrate
|
* pages on cc->migratepages. We stop searching if the migrate
|
||||||
* and free page scanners meet or enough free pages are isolated.
|
* and free page scanners meet or enough free pages are isolated.
|
||||||
*/
|
*/
|
||||||
for (; block_start_pfn >= low_pfn &&
|
for (; block_start_pfn >= low_pfn;
|
||||||
cc->nr_migratepages > cc->nr_freepages;
|
|
||||||
block_end_pfn = block_start_pfn,
|
block_end_pfn = block_start_pfn,
|
||||||
block_start_pfn -= pageblock_nr_pages,
|
block_start_pfn -= pageblock_nr_pages,
|
||||||
isolate_start_pfn = block_start_pfn) {
|
isolate_start_pfn = block_start_pfn) {
|
||||||
|
@ -966,6 +1013,8 @@ static void isolate_freepages(struct compact_control *cc)
|
||||||
block_end_pfn, freelist, false);
|
block_end_pfn, freelist, false);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* If we isolated enough freepages, or aborted due to async
|
||||||
|
* compaction being contended, terminate the loop.
|
||||||
* Remember where the free scanner should restart next time,
|
* Remember where the free scanner should restart next time,
|
||||||
* which is where isolate_freepages_block() left off.
|
* which is where isolate_freepages_block() left off.
|
||||||
* But if it scanned the whole pageblock, isolate_start_pfn
|
* But if it scanned the whole pageblock, isolate_start_pfn
|
||||||
|
@ -974,27 +1023,31 @@ static void isolate_freepages(struct compact_control *cc)
|
||||||
* In that case we will however want to restart at the start
|
* In that case we will however want to restart at the start
|
||||||
* of the previous pageblock.
|
* of the previous pageblock.
|
||||||
*/
|
*/
|
||||||
cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
|
if ((cc->nr_freepages >= cc->nr_migratepages)
|
||||||
isolate_start_pfn :
|
|| cc->contended) {
|
||||||
block_start_pfn - pageblock_nr_pages;
|
if (isolate_start_pfn >= block_end_pfn)
|
||||||
|
isolate_start_pfn =
|
||||||
/*
|
block_start_pfn - pageblock_nr_pages;
|
||||||
* isolate_freepages_block() might have aborted due to async
|
|
||||||
* compaction being contended
|
|
||||||
*/
|
|
||||||
if (cc->contended)
|
|
||||||
break;
|
break;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* isolate_freepages_block() should not terminate
|
||||||
|
* prematurely unless contended, or isolated enough
|
||||||
|
*/
|
||||||
|
VM_BUG_ON(isolate_start_pfn < block_end_pfn);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* split_free_page does not map the pages */
|
/* split_free_page does not map the pages */
|
||||||
map_pages(freelist);
|
map_pages(freelist);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If we crossed the migrate scanner, we want to keep it that way
|
* Record where the free scanner will restart next time. Either we
|
||||||
* so that compact_finished() may detect this
|
* broke from the loop and set isolate_start_pfn based on the last
|
||||||
|
* call to isolate_freepages_block(), or we met the migration scanner
|
||||||
|
* and the loop terminated due to isolate_start_pfn < low_pfn
|
||||||
*/
|
*/
|
||||||
if (block_start_pfn < low_pfn)
|
cc->free_pfn = isolate_start_pfn;
|
||||||
cc->free_pfn = cc->migrate_pfn;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1062,6 +1115,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||||
struct compact_control *cc)
|
struct compact_control *cc)
|
||||||
{
|
{
|
||||||
unsigned long low_pfn, end_pfn;
|
unsigned long low_pfn, end_pfn;
|
||||||
|
unsigned long isolate_start_pfn;
|
||||||
struct page *page;
|
struct page *page;
|
||||||
const isolate_mode_t isolate_mode =
|
const isolate_mode_t isolate_mode =
|
||||||
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
|
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
|
||||||
|
@ -1110,6 +1164,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
/* Perform the isolation */
|
/* Perform the isolation */
|
||||||
|
isolate_start_pfn = low_pfn;
|
||||||
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
|
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
|
||||||
isolate_mode);
|
isolate_mode);
|
||||||
|
|
||||||
|
@ -1118,6 +1173,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||||
return ISOLATE_ABORT;
|
return ISOLATE_ABORT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Record where we could have freed pages by migration and not
|
||||||
|
* yet flushed them to buddy allocator.
|
||||||
|
* - this is the lowest page that could have been isolated and
|
||||||
|
* then freed by migration.
|
||||||
|
*/
|
||||||
|
if (cc->nr_migratepages && !cc->last_migrated_pfn)
|
||||||
|
cc->last_migrated_pfn = isolate_start_pfn;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Either we isolated something and proceed with migration. Or
|
* Either we isolated something and proceed with migration. Or
|
||||||
* we failed and compact_zone should decide if we should
|
* we failed and compact_zone should decide if we should
|
||||||
|
@ -1127,12 +1191,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
||||||
}
|
}
|
||||||
|
|
||||||
acct_isolated(zone, cc);
|
acct_isolated(zone, cc);
|
||||||
/*
|
/* Record where migration scanner will be restarted. */
|
||||||
* Record where migration scanner will be restarted. If we end up in
|
cc->migrate_pfn = low_pfn;
|
||||||
* the same pageblock as the free scanner, make the scanners fully
|
|
||||||
* meet so that compact_finished() terminates compaction.
|
|
||||||
*/
|
|
||||||
cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
|
|
||||||
|
|
||||||
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
|
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
|
||||||
}
|
}
|
||||||
|
@ -1147,11 +1207,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
|
||||||
return COMPACT_PARTIAL;
|
return COMPACT_PARTIAL;
|
||||||
|
|
||||||
/* Compaction run completes if the migrate and free scanner meet */
|
/* Compaction run completes if the migrate and free scanner meet */
|
||||||
if (cc->free_pfn <= cc->migrate_pfn) {
|
if (compact_scanners_met(cc)) {
|
||||||
/* Let the next compaction start anew. */
|
/* Let the next compaction start anew. */
|
||||||
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
|
reset_cached_positions(zone);
|
||||||
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
|
|
||||||
zone->compact_cached_free_pfn = zone_end_pfn(zone);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Mark that the PG_migrate_skip information should be cleared
|
* Mark that the PG_migrate_skip information should be cleared
|
||||||
|
@ -1295,7 +1353,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||||
unsigned long end_pfn = zone_end_pfn(zone);
|
unsigned long end_pfn = zone_end_pfn(zone);
|
||||||
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
|
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
|
||||||
const bool sync = cc->mode != MIGRATE_ASYNC;
|
const bool sync = cc->mode != MIGRATE_ASYNC;
|
||||||
unsigned long last_migrated_pfn = 0;
|
|
||||||
|
|
||||||
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
|
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
|
||||||
cc->classzone_idx);
|
cc->classzone_idx);
|
||||||
|
@ -1333,6 +1390,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||||
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
|
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
|
||||||
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
|
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
|
||||||
}
|
}
|
||||||
|
cc->last_migrated_pfn = 0;
|
||||||
|
|
||||||
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
|
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
|
||||||
cc->free_pfn, end_pfn, sync);
|
cc->free_pfn, end_pfn, sync);
|
||||||
|
@ -1342,7 +1400,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||||
while ((ret = compact_finished(zone, cc, migratetype)) ==
|
while ((ret = compact_finished(zone, cc, migratetype)) ==
|
||||||
COMPACT_CONTINUE) {
|
COMPACT_CONTINUE) {
|
||||||
int err;
|
int err;
|
||||||
unsigned long isolate_start_pfn = cc->migrate_pfn;
|
|
||||||
|
|
||||||
switch (isolate_migratepages(zone, cc)) {
|
switch (isolate_migratepages(zone, cc)) {
|
||||||
case ISOLATE_ABORT:
|
case ISOLATE_ABORT:
|
||||||
|
@ -1376,22 +1433,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
||||||
* migrate_pages() may return -ENOMEM when scanners meet
|
* migrate_pages() may return -ENOMEM when scanners meet
|
||||||
* and we want compact_finished() to detect it
|
* and we want compact_finished() to detect it
|
||||||
*/
|
*/
|
||||||
if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
|
if (err == -ENOMEM && !compact_scanners_met(cc)) {
|
||||||
ret = COMPACT_PARTIAL;
|
ret = COMPACT_PARTIAL;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Record where we could have freed pages by migration and not
|
|
||||||
* yet flushed them to buddy allocator. We use the pfn that
|
|
||||||
* isolate_migratepages() started from in this loop iteration
|
|
||||||
* - this is the lowest page that could have been isolated and
|
|
||||||
* then freed by migration.
|
|
||||||
*/
|
|
||||||
if (!last_migrated_pfn)
|
|
||||||
last_migrated_pfn = isolate_start_pfn;
|
|
||||||
|
|
||||||
check_drain:
|
check_drain:
|
||||||
/*
|
/*
|
||||||
* Has the migration scanner moved away from the previous
|
* Has the migration scanner moved away from the previous
|
||||||
|
@ -1400,18 +1447,18 @@ check_drain:
|
||||||
* compact_finished() can detect immediately if allocation
|
* compact_finished() can detect immediately if allocation
|
||||||
* would succeed.
|
* would succeed.
|
||||||
*/
|
*/
|
||||||
if (cc->order > 0 && last_migrated_pfn) {
|
if (cc->order > 0 && cc->last_migrated_pfn) {
|
||||||
int cpu;
|
int cpu;
|
||||||
unsigned long current_block_start =
|
unsigned long current_block_start =
|
||||||
cc->migrate_pfn & ~((1UL << cc->order) - 1);
|
cc->migrate_pfn & ~((1UL << cc->order) - 1);
|
||||||
|
|
||||||
if (last_migrated_pfn < current_block_start) {
|
if (cc->last_migrated_pfn < current_block_start) {
|
||||||
cpu = get_cpu();
|
cpu = get_cpu();
|
||||||
lru_add_drain_cpu(cpu);
|
lru_add_drain_cpu(cpu);
|
||||||
drain_local_pages(zone);
|
drain_local_pages(zone);
|
||||||
put_cpu();
|
put_cpu();
|
||||||
/* No more flushing until we migrate again */
|
/* No more flushing until we migrate again */
|
||||||
last_migrated_pfn = 0;
|
cc->last_migrated_pfn = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
12
mm/dmapool.c
12
mm/dmapool.c
|
@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool)
|
||||||
{
|
{
|
||||||
bool empty = false;
|
bool empty = false;
|
||||||
|
|
||||||
|
if (unlikely(!pool))
|
||||||
|
return;
|
||||||
|
|
||||||
mutex_lock(&pools_reg_lock);
|
mutex_lock(&pools_reg_lock);
|
||||||
mutex_lock(&pools_lock);
|
mutex_lock(&pools_lock);
|
||||||
list_del(&pool->pools);
|
list_del(&pool->pools);
|
||||||
|
@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||||
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
|
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
|
||||||
spin_unlock_irqrestore(&pool->lock, flags);
|
spin_unlock_irqrestore(&pool->lock, flags);
|
||||||
|
|
||||||
page = pool_alloc_page(pool, mem_flags);
|
page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
|
||||||
if (!page)
|
if (!page)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
memset(retval, POOL_POISON_ALLOCATED, pool->size);
|
if (!(mem_flags & __GFP_ZERO))
|
||||||
|
memset(retval, POOL_POISON_ALLOCATED, pool->size);
|
||||||
#endif
|
#endif
|
||||||
spin_unlock_irqrestore(&pool->lock, flags);
|
spin_unlock_irqrestore(&pool->lock, flags);
|
||||||
|
|
||||||
|
if (mem_flags & __GFP_ZERO)
|
||||||
|
memset(retval, 0, pool->size);
|
||||||
|
|
||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(dma_pool_alloc);
|
EXPORT_SYMBOL(dma_pool_alloc);
|
||||||
|
|
|
@ -224,6 +224,28 @@ early_memremap_ro(resource_size_t phys_addr, unsigned long size)
|
||||||
return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
|
return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
|
||||||
|
|
||||||
|
void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
|
||||||
|
{
|
||||||
|
unsigned long slop, clen;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
while (size) {
|
||||||
|
slop = src & ~PAGE_MASK;
|
||||||
|
clen = size;
|
||||||
|
if (clen > MAX_MAP_CHUNK - slop)
|
||||||
|
clen = MAX_MAP_CHUNK - slop;
|
||||||
|
p = early_memremap(src & PAGE_MASK, clen + slop);
|
||||||
|
memcpy(dest, p + slop, clen);
|
||||||
|
early_memunmap(p, clen + slop);
|
||||||
|
dest += clen;
|
||||||
|
src += clen;
|
||||||
|
size -= clen;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#else /* CONFIG_MMU */
|
#else /* CONFIG_MMU */
|
||||||
|
|
||||||
void __init __iomem *
|
void __init __iomem *
|
||||||
|
|
36
mm/filemap.c
36
mm/filemap.c
|
@ -674,7 +674,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
|
||||||
do {
|
do {
|
||||||
cpuset_mems_cookie = read_mems_allowed_begin();
|
cpuset_mems_cookie = read_mems_allowed_begin();
|
||||||
n = cpuset_mem_spread_node();
|
n = cpuset_mem_spread_node();
|
||||||
page = alloc_pages_exact_node(n, gfp, 0);
|
page = __alloc_pages_node(n, gfp, 0);
|
||||||
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
|
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
|
@ -2473,21 +2473,6 @@ ssize_t generic_perform_write(struct file *file,
|
||||||
iov_iter_count(i));
|
iov_iter_count(i));
|
||||||
|
|
||||||
again:
|
again:
|
||||||
/*
|
|
||||||
* Bring in the user page that we will copy from _first_.
|
|
||||||
* Otherwise there's a nasty deadlock on copying from the
|
|
||||||
* same page as we're writing to, without it being marked
|
|
||||||
* up-to-date.
|
|
||||||
*
|
|
||||||
* Not only is this an optimisation, but it is also required
|
|
||||||
* to check that the address is actually valid, when atomic
|
|
||||||
* usercopies are used, below.
|
|
||||||
*/
|
|
||||||
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
|
||||||
status = -EFAULT;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
|
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
|
||||||
&page, &fsdata);
|
&page, &fsdata);
|
||||||
if (unlikely(status < 0))
|
if (unlikely(status < 0))
|
||||||
|
@ -2495,8 +2480,17 @@ again:
|
||||||
|
|
||||||
if (mapping_writably_mapped(mapping))
|
if (mapping_writably_mapped(mapping))
|
||||||
flush_dcache_page(page);
|
flush_dcache_page(page);
|
||||||
|
/*
|
||||||
|
* 'page' is now locked. If we are trying to copy from a
|
||||||
|
* mapping of 'page' in userspace, the copy might fault and
|
||||||
|
* would need PageUptodate() to complete. But, page can not be
|
||||||
|
* made Uptodate without acquiring the page lock, which we hold.
|
||||||
|
* Deadlock. Avoid with pagefault_disable(). Fix up below with
|
||||||
|
* iov_iter_fault_in_readable().
|
||||||
|
*/
|
||||||
|
pagefault_disable();
|
||||||
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
|
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
|
||||||
|
pagefault_enable();
|
||||||
flush_dcache_page(page);
|
flush_dcache_page(page);
|
||||||
|
|
||||||
status = a_ops->write_end(file, mapping, pos, bytes, copied,
|
status = a_ops->write_end(file, mapping, pos, bytes, copied,
|
||||||
|
@ -2519,6 +2513,14 @@ again:
|
||||||
*/
|
*/
|
||||||
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
|
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
|
||||||
iov_iter_single_seg_count(i));
|
iov_iter_single_seg_count(i));
|
||||||
|
/*
|
||||||
|
* This is the fallback to recover if the copy from
|
||||||
|
* userspace above faults.
|
||||||
|
*/
|
||||||
|
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
|
||||||
|
status = -EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
goto again;
|
goto again;
|
||||||
}
|
}
|
||||||
pos += copied;
|
pos += copied;
|
||||||
|
|
163
mm/huge_memory.c
163
mm/huge_memory.c
|
@ -16,6 +16,7 @@
|
||||||
#include <linux/swap.h>
|
#include <linux/swap.h>
|
||||||
#include <linux/shrinker.h>
|
#include <linux/shrinker.h>
|
||||||
#include <linux/mm_inline.h>
|
#include <linux/mm_inline.h>
|
||||||
|
#include <linux/dax.h>
|
||||||
#include <linux/kthread.h>
|
#include <linux/kthread.h>
|
||||||
#include <linux/khugepaged.h>
|
#include <linux/khugepaged.h>
|
||||||
#include <linux/freezer.h>
|
#include <linux/freezer.h>
|
||||||
|
@ -105,7 +106,7 @@ static struct khugepaged_scan khugepaged_scan = {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static int set_recommended_min_free_kbytes(void)
|
static void set_recommended_min_free_kbytes(void)
|
||||||
{
|
{
|
||||||
struct zone *zone;
|
struct zone *zone;
|
||||||
int nr_zones = 0;
|
int nr_zones = 0;
|
||||||
|
@ -140,7 +141,6 @@ static int set_recommended_min_free_kbytes(void)
|
||||||
min_free_kbytes = recommended_min;
|
min_free_kbytes = recommended_min;
|
||||||
}
|
}
|
||||||
setup_per_zone_wmarks();
|
setup_per_zone_wmarks();
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int start_stop_khugepaged(void)
|
static int start_stop_khugepaged(void)
|
||||||
|
@ -172,12 +172,7 @@ fail:
|
||||||
static atomic_t huge_zero_refcount;
|
static atomic_t huge_zero_refcount;
|
||||||
struct page *huge_zero_page __read_mostly;
|
struct page *huge_zero_page __read_mostly;
|
||||||
|
|
||||||
static inline bool is_huge_zero_pmd(pmd_t pmd)
|
struct page *get_huge_zero_page(void)
|
||||||
{
|
|
||||||
return is_huge_zero_page(pmd_page(pmd));
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct page *get_huge_zero_page(void)
|
|
||||||
{
|
{
|
||||||
struct page *zero_page;
|
struct page *zero_page;
|
||||||
retry:
|
retry:
|
||||||
|
@ -794,16 +789,19 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Caller must hold page table lock. */
|
/* Caller must hold page table lock. */
|
||||||
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
|
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
|
||||||
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
|
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
|
||||||
struct page *zero_page)
|
struct page *zero_page)
|
||||||
{
|
{
|
||||||
pmd_t entry;
|
pmd_t entry;
|
||||||
|
if (!pmd_none(*pmd))
|
||||||
|
return false;
|
||||||
entry = mk_pmd(zero_page, vma->vm_page_prot);
|
entry = mk_pmd(zero_page, vma->vm_page_prot);
|
||||||
entry = pmd_mkhuge(entry);
|
entry = pmd_mkhuge(entry);
|
||||||
pgtable_trans_huge_deposit(mm, pmd, pgtable);
|
pgtable_trans_huge_deposit(mm, pmd, pgtable);
|
||||||
set_pmd_at(mm, haddr, pmd, entry);
|
set_pmd_at(mm, haddr, pmd, entry);
|
||||||
atomic_long_inc(&mm->nr_ptes);
|
atomic_long_inc(&mm->nr_ptes);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
|
@ -870,6 +868,49 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
flags);
|
flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
|
||||||
|
pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
|
||||||
|
{
|
||||||
|
struct mm_struct *mm = vma->vm_mm;
|
||||||
|
pmd_t entry;
|
||||||
|
spinlock_t *ptl;
|
||||||
|
|
||||||
|
ptl = pmd_lock(mm, pmd);
|
||||||
|
if (pmd_none(*pmd)) {
|
||||||
|
entry = pmd_mkhuge(pfn_pmd(pfn, prot));
|
||||||
|
if (write) {
|
||||||
|
entry = pmd_mkyoung(pmd_mkdirty(entry));
|
||||||
|
entry = maybe_pmd_mkwrite(entry, vma);
|
||||||
|
}
|
||||||
|
set_pmd_at(mm, addr, pmd, entry);
|
||||||
|
update_mmu_cache_pmd(vma, addr, pmd);
|
||||||
|
}
|
||||||
|
spin_unlock(ptl);
|
||||||
|
}
|
||||||
|
|
||||||
|
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
|
||||||
|
pmd_t *pmd, unsigned long pfn, bool write)
|
||||||
|
{
|
||||||
|
pgprot_t pgprot = vma->vm_page_prot;
|
||||||
|
/*
|
||||||
|
* If we had pmd_special, we could avoid all these restrictions,
|
||||||
|
* but we need to be consistent with PTEs and architectures that
|
||||||
|
* can't support a 'special' bit.
|
||||||
|
*/
|
||||||
|
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
|
||||||
|
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
|
||||||
|
(VM_PFNMAP|VM_MIXEDMAP));
|
||||||
|
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
|
||||||
|
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
|
||||||
|
|
||||||
|
if (addr < vma->vm_start || addr >= vma->vm_end)
|
||||||
|
return VM_FAULT_SIGBUS;
|
||||||
|
if (track_pfn_insert(vma, &pgprot, pfn))
|
||||||
|
return VM_FAULT_SIGBUS;
|
||||||
|
insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
|
||||||
|
return VM_FAULT_NOPAGE;
|
||||||
|
}
|
||||||
|
|
||||||
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||||
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
|
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
|
||||||
struct vm_area_struct *vma)
|
struct vm_area_struct *vma)
|
||||||
|
@ -1414,41 +1455,41 @@ out:
|
||||||
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||||
pmd_t *pmd, unsigned long addr)
|
pmd_t *pmd, unsigned long addr)
|
||||||
{
|
{
|
||||||
|
pmd_t orig_pmd;
|
||||||
spinlock_t *ptl;
|
spinlock_t *ptl;
|
||||||
int ret = 0;
|
|
||||||
|
|
||||||
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
|
if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
|
||||||
struct page *page;
|
return 0;
|
||||||
pgtable_t pgtable;
|
/*
|
||||||
pmd_t orig_pmd;
|
* For architectures like ppc64 we look at deposited pgtable
|
||||||
/*
|
* when calling pmdp_huge_get_and_clear. So do the
|
||||||
* For architectures like ppc64 we look at deposited pgtable
|
* pgtable_trans_huge_withdraw after finishing pmdp related
|
||||||
* when calling pmdp_huge_get_and_clear. So do the
|
* operations.
|
||||||
* pgtable_trans_huge_withdraw after finishing pmdp related
|
*/
|
||||||
* operations.
|
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
|
||||||
*/
|
tlb->fullmm);
|
||||||
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
|
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
|
||||||
tlb->fullmm);
|
if (vma_is_dax(vma)) {
|
||||||
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
|
spin_unlock(ptl);
|
||||||
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
|
if (is_huge_zero_pmd(orig_pmd))
|
||||||
if (is_huge_zero_pmd(orig_pmd)) {
|
|
||||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
|
||||||
spin_unlock(ptl);
|
|
||||||
put_huge_zero_page();
|
put_huge_zero_page();
|
||||||
} else {
|
} else if (is_huge_zero_pmd(orig_pmd)) {
|
||||||
page = pmd_page(orig_pmd);
|
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
|
||||||
page_remove_rmap(page);
|
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||||
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
|
spin_unlock(ptl);
|
||||||
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
|
put_huge_zero_page();
|
||||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
} else {
|
||||||
atomic_long_dec(&tlb->mm->nr_ptes);
|
struct page *page = pmd_page(orig_pmd);
|
||||||
spin_unlock(ptl);
|
page_remove_rmap(page);
|
||||||
tlb_remove_page(tlb, page);
|
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
|
||||||
}
|
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
|
||||||
pte_free(tlb->mm, pgtable);
|
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||||
ret = 1;
|
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
|
||||||
|
atomic_long_dec(&tlb->mm->nr_ptes);
|
||||||
|
spin_unlock(ptl);
|
||||||
|
tlb_remove_page(tlb, page);
|
||||||
}
|
}
|
||||||
return ret;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
|
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
|
||||||
|
@ -2285,8 +2326,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||||
|
|
||||||
static void khugepaged_alloc_sleep(void)
|
static void khugepaged_alloc_sleep(void)
|
||||||
{
|
{
|
||||||
wait_event_freezable_timeout(khugepaged_wait, false,
|
DEFINE_WAIT(wait);
|
||||||
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
|
|
||||||
|
add_wait_queue(&khugepaged_wait, &wait);
|
||||||
|
freezable_schedule_timeout_interruptible(
|
||||||
|
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
|
||||||
|
remove_wait_queue(&khugepaged_wait, &wait);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int khugepaged_node_load[MAX_NUMNODES];
|
static int khugepaged_node_load[MAX_NUMNODES];
|
||||||
|
@ -2373,7 +2418,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
|
||||||
*/
|
*/
|
||||||
up_read(&mm->mmap_sem);
|
up_read(&mm->mmap_sem);
|
||||||
|
|
||||||
*hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
|
*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
|
||||||
if (unlikely(!*hpage)) {
|
if (unlikely(!*hpage)) {
|
||||||
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
|
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
|
||||||
*hpage = ERR_PTR(-ENOMEM);
|
*hpage = ERR_PTR(-ENOMEM);
|
||||||
|
@ -2911,7 +2956,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
|
||||||
pmd_t *pmd)
|
pmd_t *pmd)
|
||||||
{
|
{
|
||||||
spinlock_t *ptl;
|
spinlock_t *ptl;
|
||||||
struct page *page;
|
struct page *page = NULL;
|
||||||
struct mm_struct *mm = vma->vm_mm;
|
struct mm_struct *mm = vma->vm_mm;
|
||||||
unsigned long haddr = address & HPAGE_PMD_MASK;
|
unsigned long haddr = address & HPAGE_PMD_MASK;
|
||||||
unsigned long mmun_start; /* For mmu_notifiers */
|
unsigned long mmun_start; /* For mmu_notifiers */
|
||||||
|
@ -2924,25 +2969,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
|
||||||
again:
|
again:
|
||||||
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
|
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
|
||||||
ptl = pmd_lock(mm, pmd);
|
ptl = pmd_lock(mm, pmd);
|
||||||
if (unlikely(!pmd_trans_huge(*pmd))) {
|
if (unlikely(!pmd_trans_huge(*pmd)))
|
||||||
spin_unlock(ptl);
|
goto unlock;
|
||||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
if (vma_is_dax(vma)) {
|
||||||
return;
|
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
|
||||||
}
|
if (is_huge_zero_pmd(_pmd))
|
||||||
if (is_huge_zero_pmd(*pmd)) {
|
put_huge_zero_page();
|
||||||
|
} else if (is_huge_zero_pmd(*pmd)) {
|
||||||
__split_huge_zero_page_pmd(vma, haddr, pmd);
|
__split_huge_zero_page_pmd(vma, haddr, pmd);
|
||||||
spin_unlock(ptl);
|
} else {
|
||||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
page = pmd_page(*pmd);
|
||||||
return;
|
VM_BUG_ON_PAGE(!page_count(page), page);
|
||||||
|
get_page(page);
|
||||||
}
|
}
|
||||||
page = pmd_page(*pmd);
|
unlock:
|
||||||
VM_BUG_ON_PAGE(!page_count(page), page);
|
|
||||||
get_page(page);
|
|
||||||
spin_unlock(ptl);
|
spin_unlock(ptl);
|
||||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
||||||
|
|
||||||
split_huge_page(page);
|
if (!page)
|
||||||
|
return;
|
||||||
|
|
||||||
|
split_huge_page(page);
|
||||||
put_page(page);
|
put_page(page);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -2991,7 +3038,7 @@ static void split_huge_page_address(struct mm_struct *mm,
|
||||||
split_huge_page_pmd_mm(mm, address, pmd);
|
split_huge_page_pmd_mm(mm, address, pmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
void __vma_adjust_trans_huge(struct vm_area_struct *vma,
|
void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||||
unsigned long start,
|
unsigned long start,
|
||||||
unsigned long end,
|
unsigned long end,
|
||||||
long adjust_next)
|
long adjust_next)
|
||||||
|
|
436
mm/hugetlb.c
436
mm/hugetlb.c
|
@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
|
||||||
* prevent spurious OOMs when the hugepage pool is fully utilized.
|
* prevent spurious OOMs when the hugepage pool is fully utilized.
|
||||||
*/
|
*/
|
||||||
static int num_fault_mutexes;
|
static int num_fault_mutexes;
|
||||||
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
|
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
|
||||||
|
|
||||||
/* Forward declaration */
|
/* Forward declaration */
|
||||||
static int hugetlb_acct_memory(struct hstate *h, long delta);
|
static int hugetlb_acct_memory(struct hstate *h, long delta);
|
||||||
|
@ -240,11 +240,14 @@ struct file_region {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Add the huge page range represented by [f, t) to the reserve
|
* Add the huge page range represented by [f, t) to the reserve
|
||||||
* map. Existing regions will be expanded to accommodate the
|
* map. In the normal case, existing regions will be expanded
|
||||||
* specified range. We know only existing regions need to be
|
* to accommodate the specified range. Sufficient regions should
|
||||||
* expanded, because region_add is only called after region_chg
|
* exist for expansion due to the previous call to region_chg
|
||||||
* with the same range. If a new file_region structure must
|
* with the same range. However, it is possible that region_del
|
||||||
* be allocated, it is done in region_chg.
|
* could have been called after region_chg and modified the map
|
||||||
|
* in such a way that no region exists to be expanded. In this
|
||||||
|
* case, pull a region descriptor from the cache associated with
|
||||||
|
* the map and use that for the new range.
|
||||||
*
|
*
|
||||||
* Return the number of new huge pages added to the map. This
|
* Return the number of new huge pages added to the map. This
|
||||||
* number is greater than or equal to zero.
|
* number is greater than or equal to zero.
|
||||||
|
@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||||
if (f <= rg->to)
|
if (f <= rg->to)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If no region exists which can be expanded to include the
|
||||||
|
* specified range, the list must have been modified by an
|
||||||
|
* interleaving call to region_del(). Pull a region descriptor
|
||||||
|
* from the cache and use it for this range.
|
||||||
|
*/
|
||||||
|
if (&rg->link == head || t < rg->from) {
|
||||||
|
VM_BUG_ON(resv->region_cache_count <= 0);
|
||||||
|
|
||||||
|
resv->region_cache_count--;
|
||||||
|
nrg = list_first_entry(&resv->region_cache, struct file_region,
|
||||||
|
link);
|
||||||
|
list_del(&nrg->link);
|
||||||
|
|
||||||
|
nrg->from = f;
|
||||||
|
nrg->to = t;
|
||||||
|
list_add(&nrg->link, rg->link.prev);
|
||||||
|
|
||||||
|
add += t - f;
|
||||||
|
goto out_locked;
|
||||||
|
}
|
||||||
|
|
||||||
/* Round our left edge to the current segment if it encloses us. */
|
/* Round our left edge to the current segment if it encloses us. */
|
||||||
if (f > rg->from)
|
if (f > rg->from)
|
||||||
f = rg->from;
|
f = rg->from;
|
||||||
|
@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||||
add += t - nrg->to; /* Added to end of region */
|
add += t - nrg->to; /* Added to end of region */
|
||||||
nrg->to = t;
|
nrg->to = t;
|
||||||
|
|
||||||
|
out_locked:
|
||||||
|
resv->adds_in_progress--;
|
||||||
spin_unlock(&resv->lock);
|
spin_unlock(&resv->lock);
|
||||||
VM_BUG_ON(add < 0);
|
VM_BUG_ON(add < 0);
|
||||||
return add;
|
return add;
|
||||||
|
@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
|
||||||
* so that the subsequent region_add call will have all the
|
* so that the subsequent region_add call will have all the
|
||||||
* regions it needs and will not fail.
|
* regions it needs and will not fail.
|
||||||
*
|
*
|
||||||
* Returns the number of huge pages that need to be added
|
* Upon entry, region_chg will also examine the cache of region descriptors
|
||||||
* to the existing reservation map for the range [f, t).
|
* associated with the map. If there are not enough descriptors cached, one
|
||||||
* This number is greater or equal to zero. -ENOMEM is
|
* will be allocated for the in progress add operation.
|
||||||
* returned if a new file_region structure is needed and can
|
*
|
||||||
* not be allocated.
|
* Returns the number of huge pages that need to be added to the existing
|
||||||
|
* reservation map for the range [f, t). This number is greater or equal to
|
||||||
|
* zero. -ENOMEM is returned if a new file_region structure or cache entry
|
||||||
|
* is needed and can not be allocated.
|
||||||
*/
|
*/
|
||||||
static long region_chg(struct resv_map *resv, long f, long t)
|
static long region_chg(struct resv_map *resv, long f, long t)
|
||||||
{
|
{
|
||||||
|
@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
spin_lock(&resv->lock);
|
spin_lock(&resv->lock);
|
||||||
|
retry_locked:
|
||||||
|
resv->adds_in_progress++;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check for sufficient descriptors in the cache to accommodate
|
||||||
|
* the number of in progress add operations.
|
||||||
|
*/
|
||||||
|
if (resv->adds_in_progress > resv->region_cache_count) {
|
||||||
|
struct file_region *trg;
|
||||||
|
|
||||||
|
VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
|
||||||
|
/* Must drop lock to allocate a new descriptor. */
|
||||||
|
resv->adds_in_progress--;
|
||||||
|
spin_unlock(&resv->lock);
|
||||||
|
|
||||||
|
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
|
||||||
|
if (!trg)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
spin_lock(&resv->lock);
|
||||||
|
list_add(&trg->link, &resv->region_cache);
|
||||||
|
resv->region_cache_count++;
|
||||||
|
goto retry_locked;
|
||||||
|
}
|
||||||
|
|
||||||
/* Locate the region we are before or in. */
|
/* Locate the region we are before or in. */
|
||||||
list_for_each_entry(rg, head, link)
|
list_for_each_entry(rg, head, link)
|
||||||
if (f <= rg->to)
|
if (f <= rg->to)
|
||||||
|
@ -336,6 +391,7 @@ retry:
|
||||||
* size such that we can guarantee to record the reservation. */
|
* size such that we can guarantee to record the reservation. */
|
||||||
if (&rg->link == head || t < rg->from) {
|
if (&rg->link == head || t < rg->from) {
|
||||||
if (!nrg) {
|
if (!nrg) {
|
||||||
|
resv->adds_in_progress--;
|
||||||
spin_unlock(&resv->lock);
|
spin_unlock(&resv->lock);
|
||||||
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
||||||
if (!nrg)
|
if (!nrg)
|
||||||
|
@ -385,43 +441,131 @@ out_nrg:
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Truncate the reserve map at index 'end'. Modify/truncate any
|
* Abort the in progress add operation. The adds_in_progress field
|
||||||
* region which contains end. Delete any regions past end.
|
* of the resv_map keeps track of the operations in progress between
|
||||||
* Return the number of huge pages removed from the map.
|
* calls to region_chg and region_add. Operations are sometimes
|
||||||
|
* aborted after the call to region_chg. In such cases, region_abort
|
||||||
|
* is called to decrement the adds_in_progress counter.
|
||||||
|
*
|
||||||
|
* NOTE: The range arguments [f, t) are not needed or used in this
|
||||||
|
* routine. They are kept to make reading the calling code easier as
|
||||||
|
* arguments will match the associated region_chg call.
|
||||||
*/
|
*/
|
||||||
static long region_truncate(struct resv_map *resv, long end)
|
static void region_abort(struct resv_map *resv, long f, long t)
|
||||||
|
{
|
||||||
|
spin_lock(&resv->lock);
|
||||||
|
VM_BUG_ON(!resv->region_cache_count);
|
||||||
|
resv->adds_in_progress--;
|
||||||
|
spin_unlock(&resv->lock);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Delete the specified range [f, t) from the reserve map. If the
|
||||||
|
* t parameter is LONG_MAX, this indicates that ALL regions after f
|
||||||
|
* should be deleted. Locate the regions which intersect [f, t)
|
||||||
|
* and either trim, delete or split the existing regions.
|
||||||
|
*
|
||||||
|
* Returns the number of huge pages deleted from the reserve map.
|
||||||
|
* In the normal case, the return value is zero or more. In the
|
||||||
|
* case where a region must be split, a new region descriptor must
|
||||||
|
* be allocated. If the allocation fails, -ENOMEM will be returned.
|
||||||
|
* NOTE: If the parameter t == LONG_MAX, then we will never split
|
||||||
|
* a region and possibly return -ENOMEM. Callers specifying
|
||||||
|
* t == LONG_MAX do not need to check for -ENOMEM error.
|
||||||
|
*/
|
||||||
|
static long region_del(struct resv_map *resv, long f, long t)
|
||||||
{
|
{
|
||||||
struct list_head *head = &resv->regions;
|
struct list_head *head = &resv->regions;
|
||||||
struct file_region *rg, *trg;
|
struct file_region *rg, *trg;
|
||||||
long chg = 0;
|
struct file_region *nrg = NULL;
|
||||||
|
long del = 0;
|
||||||
|
|
||||||
|
retry:
|
||||||
spin_lock(&resv->lock);
|
spin_lock(&resv->lock);
|
||||||
/* Locate the region we are either in or before. */
|
list_for_each_entry_safe(rg, trg, head, link) {
|
||||||
list_for_each_entry(rg, head, link)
|
if (rg->to <= f)
|
||||||
if (end <= rg->to)
|
continue;
|
||||||
|
if (rg->from >= t)
|
||||||
break;
|
break;
|
||||||
if (&rg->link == head)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
/* If we are in the middle of a region then adjust it. */
|
if (f > rg->from && t < rg->to) { /* Must split region */
|
||||||
if (end > rg->from) {
|
/*
|
||||||
chg = rg->to - end;
|
* Check for an entry in the cache before dropping
|
||||||
rg->to = end;
|
* lock and attempting allocation.
|
||||||
rg = list_entry(rg->link.next, typeof(*rg), link);
|
*/
|
||||||
|
if (!nrg &&
|
||||||
|
resv->region_cache_count > resv->adds_in_progress) {
|
||||||
|
nrg = list_first_entry(&resv->region_cache,
|
||||||
|
struct file_region,
|
||||||
|
link);
|
||||||
|
list_del(&nrg->link);
|
||||||
|
resv->region_cache_count--;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!nrg) {
|
||||||
|
spin_unlock(&resv->lock);
|
||||||
|
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
|
||||||
|
if (!nrg)
|
||||||
|
return -ENOMEM;
|
||||||
|
goto retry;
|
||||||
|
}
|
||||||
|
|
||||||
|
del += t - f;
|
||||||
|
|
||||||
|
/* New entry for end of split region */
|
||||||
|
nrg->from = t;
|
||||||
|
nrg->to = rg->to;
|
||||||
|
INIT_LIST_HEAD(&nrg->link);
|
||||||
|
|
||||||
|
/* Original entry is trimmed */
|
||||||
|
rg->to = f;
|
||||||
|
|
||||||
|
list_add(&nrg->link, &rg->link);
|
||||||
|
nrg = NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
|
||||||
|
del += rg->to - rg->from;
|
||||||
|
list_del(&rg->link);
|
||||||
|
kfree(rg);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (f <= rg->from) { /* Trim beginning of region */
|
||||||
|
del += t - rg->from;
|
||||||
|
rg->from = t;
|
||||||
|
} else { /* Trim end of region */
|
||||||
|
del += rg->to - f;
|
||||||
|
rg->to = f;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Drop any remaining regions. */
|
|
||||||
list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
|
|
||||||
if (&rg->link == head)
|
|
||||||
break;
|
|
||||||
chg += rg->to - rg->from;
|
|
||||||
list_del(&rg->link);
|
|
||||||
kfree(rg);
|
|
||||||
}
|
|
||||||
|
|
||||||
out:
|
|
||||||
spin_unlock(&resv->lock);
|
spin_unlock(&resv->lock);
|
||||||
return chg;
|
kfree(nrg);
|
||||||
|
return del;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A rare out of memory error was encountered which prevented removal of
|
||||||
|
* the reserve map region for a page. The huge page itself was free'ed
|
||||||
|
* and removed from the page cache. This routine will adjust the subpool
|
||||||
|
* usage count, and the global reserve count if needed. By incrementing
|
||||||
|
* these counts, the reserve map entry which could not be deleted will
|
||||||
|
* appear as a "reserved" entry instead of simply dangling with incorrect
|
||||||
|
* counts.
|
||||||
|
*/
|
||||||
|
void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
|
||||||
|
{
|
||||||
|
struct hugepage_subpool *spool = subpool_inode(inode);
|
||||||
|
long rsv_adjust;
|
||||||
|
|
||||||
|
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
|
||||||
|
if (restore_reserve && rsv_adjust) {
|
||||||
|
struct hstate *h = hstate_inode(inode);
|
||||||
|
|
||||||
|
hugetlb_acct_memory(h, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -544,22 +688,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
|
||||||
struct resv_map *resv_map_alloc(void)
|
struct resv_map *resv_map_alloc(void)
|
||||||
{
|
{
|
||||||
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
|
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
|
||||||
if (!resv_map)
|
struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
|
||||||
|
|
||||||
|
if (!resv_map || !rg) {
|
||||||
|
kfree(resv_map);
|
||||||
|
kfree(rg);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
kref_init(&resv_map->refs);
|
kref_init(&resv_map->refs);
|
||||||
spin_lock_init(&resv_map->lock);
|
spin_lock_init(&resv_map->lock);
|
||||||
INIT_LIST_HEAD(&resv_map->regions);
|
INIT_LIST_HEAD(&resv_map->regions);
|
||||||
|
|
||||||
|
resv_map->adds_in_progress = 0;
|
||||||
|
|
||||||
|
INIT_LIST_HEAD(&resv_map->region_cache);
|
||||||
|
list_add(&rg->link, &resv_map->region_cache);
|
||||||
|
resv_map->region_cache_count = 1;
|
||||||
|
|
||||||
return resv_map;
|
return resv_map;
|
||||||
}
|
}
|
||||||
|
|
||||||
void resv_map_release(struct kref *ref)
|
void resv_map_release(struct kref *ref)
|
||||||
{
|
{
|
||||||
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
|
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
|
||||||
|
struct list_head *head = &resv_map->region_cache;
|
||||||
|
struct file_region *rg, *trg;
|
||||||
|
|
||||||
/* Clear out any active regions before we release the map. */
|
/* Clear out any active regions before we release the map. */
|
||||||
region_truncate(resv_map, 0);
|
region_del(resv_map, 0, LONG_MAX);
|
||||||
|
|
||||||
|
/* ... and any entries left in the cache */
|
||||||
|
list_for_each_entry_safe(rg, trg, head, link) {
|
||||||
|
list_del(&rg->link);
|
||||||
|
kfree(rg);
|
||||||
|
}
|
||||||
|
|
||||||
|
VM_BUG_ON(resv_map->adds_in_progress);
|
||||||
|
|
||||||
kfree(resv_map);
|
kfree(resv_map);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -635,8 +801,19 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Shared mappings always use reserves */
|
/* Shared mappings always use reserves */
|
||||||
if (vma->vm_flags & VM_MAYSHARE)
|
if (vma->vm_flags & VM_MAYSHARE) {
|
||||||
return true;
|
/*
|
||||||
|
* We know VM_NORESERVE is not set. Therefore, there SHOULD
|
||||||
|
* be a region map for all pages. The only situation where
|
||||||
|
* there is no region map is if a hole was punched via
|
||||||
|
* fallocate. In this case, there really are no reserves to
|
||||||
|
* use. This situation is indicated if chg != 0.
|
||||||
|
*/
|
||||||
|
if (chg)
|
||||||
|
return false;
|
||||||
|
else
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Only the process that called mmap() has reserves for
|
* Only the process that called mmap() has reserves for
|
||||||
|
@ -1154,7 +1331,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
|
||||||
{
|
{
|
||||||
struct page *page;
|
struct page *page;
|
||||||
|
|
||||||
page = alloc_pages_exact_node(nid,
|
page = __alloc_pages_node(nid,
|
||||||
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
||||||
__GFP_REPEAT|__GFP_NOWARN,
|
__GFP_REPEAT|__GFP_NOWARN,
|
||||||
huge_page_order(h));
|
huge_page_order(h));
|
||||||
|
@ -1306,7 +1483,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
|
||||||
__GFP_REPEAT|__GFP_NOWARN,
|
__GFP_REPEAT|__GFP_NOWARN,
|
||||||
huge_page_order(h));
|
huge_page_order(h));
|
||||||
else
|
else
|
||||||
page = alloc_pages_exact_node(nid,
|
page = __alloc_pages_node(nid,
|
||||||
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
|
||||||
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
|
||||||
|
|
||||||
|
@ -1473,16 +1650,19 @@ static void return_unused_surplus_pages(struct hstate *h,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* vma_needs_reservation and vma_commit_reservation are used by the huge
|
* vma_needs_reservation, vma_commit_reservation and vma_end_reservation
|
||||||
* page allocation routines to manage reservations.
|
* are used by the huge page allocation routines to manage reservations.
|
||||||
*
|
*
|
||||||
* vma_needs_reservation is called to determine if the huge page at addr
|
* vma_needs_reservation is called to determine if the huge page at addr
|
||||||
* within the vma has an associated reservation. If a reservation is
|
* within the vma has an associated reservation. If a reservation is
|
||||||
* needed, the value 1 is returned. The caller is then responsible for
|
* needed, the value 1 is returned. The caller is then responsible for
|
||||||
* managing the global reservation and subpool usage counts. After
|
* managing the global reservation and subpool usage counts. After
|
||||||
* the huge page has been allocated, vma_commit_reservation is called
|
* the huge page has been allocated, vma_commit_reservation is called
|
||||||
* to add the page to the reservation map.
|
* to add the page to the reservation map. If the page allocation fails,
|
||||||
|
* the reservation must be ended instead of committed. vma_end_reservation
|
||||||
|
* is called in such cases.
|
||||||
*
|
*
|
||||||
* In the normal case, vma_commit_reservation returns the same value
|
* In the normal case, vma_commit_reservation returns the same value
|
||||||
* as the preceding vma_needs_reservation call. The only time this
|
* as the preceding vma_needs_reservation call. The only time this
|
||||||
|
@ -1490,9 +1670,14 @@ static void return_unused_surplus_pages(struct hstate *h,
|
||||||
* is the responsibility of the caller to notice the difference and
|
* is the responsibility of the caller to notice the difference and
|
||||||
* take appropriate action.
|
* take appropriate action.
|
||||||
*/
|
*/
|
||||||
|
enum vma_resv_mode {
|
||||||
|
VMA_NEEDS_RESV,
|
||||||
|
VMA_COMMIT_RESV,
|
||||||
|
VMA_END_RESV,
|
||||||
|
};
|
||||||
static long __vma_reservation_common(struct hstate *h,
|
static long __vma_reservation_common(struct hstate *h,
|
||||||
struct vm_area_struct *vma, unsigned long addr,
|
struct vm_area_struct *vma, unsigned long addr,
|
||||||
bool commit)
|
enum vma_resv_mode mode)
|
||||||
{
|
{
|
||||||
struct resv_map *resv;
|
struct resv_map *resv;
|
||||||
pgoff_t idx;
|
pgoff_t idx;
|
||||||
|
@ -1503,10 +1688,20 @@ static long __vma_reservation_common(struct hstate *h,
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
idx = vma_hugecache_offset(h, vma, addr);
|
idx = vma_hugecache_offset(h, vma, addr);
|
||||||
if (commit)
|
switch (mode) {
|
||||||
ret = region_add(resv, idx, idx + 1);
|
case VMA_NEEDS_RESV:
|
||||||
else
|
|
||||||
ret = region_chg(resv, idx, idx + 1);
|
ret = region_chg(resv, idx, idx + 1);
|
||||||
|
break;
|
||||||
|
case VMA_COMMIT_RESV:
|
||||||
|
ret = region_add(resv, idx, idx + 1);
|
||||||
|
break;
|
||||||
|
case VMA_END_RESV:
|
||||||
|
region_abort(resv, idx, idx + 1);
|
||||||
|
ret = 0;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
BUG();
|
||||||
|
}
|
||||||
|
|
||||||
if (vma->vm_flags & VM_MAYSHARE)
|
if (vma->vm_flags & VM_MAYSHARE)
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -1517,47 +1712,79 @@ static long __vma_reservation_common(struct hstate *h,
|
||||||
static long vma_needs_reservation(struct hstate *h,
|
static long vma_needs_reservation(struct hstate *h,
|
||||||
struct vm_area_struct *vma, unsigned long addr)
|
struct vm_area_struct *vma, unsigned long addr)
|
||||||
{
|
{
|
||||||
return __vma_reservation_common(h, vma, addr, false);
|
return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
|
||||||
}
|
}
|
||||||
|
|
||||||
static long vma_commit_reservation(struct hstate *h,
|
static long vma_commit_reservation(struct hstate *h,
|
||||||
struct vm_area_struct *vma, unsigned long addr)
|
struct vm_area_struct *vma, unsigned long addr)
|
||||||
{
|
{
|
||||||
return __vma_reservation_common(h, vma, addr, true);
|
return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
static void vma_end_reservation(struct hstate *h,
|
||||||
|
struct vm_area_struct *vma, unsigned long addr)
|
||||||
|
{
|
||||||
|
(void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||||
unsigned long addr, int avoid_reserve)
|
unsigned long addr, int avoid_reserve)
|
||||||
{
|
{
|
||||||
struct hugepage_subpool *spool = subpool_vma(vma);
|
struct hugepage_subpool *spool = subpool_vma(vma);
|
||||||
struct hstate *h = hstate_vma(vma);
|
struct hstate *h = hstate_vma(vma);
|
||||||
struct page *page;
|
struct page *page;
|
||||||
long chg, commit;
|
long map_chg, map_commit;
|
||||||
|
long gbl_chg;
|
||||||
int ret, idx;
|
int ret, idx;
|
||||||
struct hugetlb_cgroup *h_cg;
|
struct hugetlb_cgroup *h_cg;
|
||||||
|
|
||||||
idx = hstate_index(h);
|
idx = hstate_index(h);
|
||||||
/*
|
/*
|
||||||
* Processes that did not create the mapping will have no
|
* Examine the region/reserve map to determine if the process
|
||||||
* reserves and will not have accounted against subpool
|
* has a reservation for the page to be allocated. A return
|
||||||
* limit. Check that the subpool limit can be made before
|
* code of zero indicates a reservation exists (no change).
|
||||||
* satisfying the allocation MAP_NORESERVE mappings may also
|
|
||||||
* need pages and subpool limit allocated allocated if no reserve
|
|
||||||
* mapping overlaps.
|
|
||||||
*/
|
*/
|
||||||
chg = vma_needs_reservation(h, vma, addr);
|
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
|
||||||
if (chg < 0)
|
if (map_chg < 0)
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(-ENOMEM);
|
||||||
if (chg || avoid_reserve)
|
|
||||||
if (hugepage_subpool_get_pages(spool, 1) < 0)
|
/*
|
||||||
|
* Processes that did not create the mapping will have no
|
||||||
|
* reserves as indicated by the region/reserve map. Check
|
||||||
|
* that the allocation will not exceed the subpool limit.
|
||||||
|
* Allocations for MAP_NORESERVE mappings also need to be
|
||||||
|
* checked against any subpool limit.
|
||||||
|
*/
|
||||||
|
if (map_chg || avoid_reserve) {
|
||||||
|
gbl_chg = hugepage_subpool_get_pages(spool, 1);
|
||||||
|
if (gbl_chg < 0) {
|
||||||
|
vma_end_reservation(h, vma, addr);
|
||||||
return ERR_PTR(-ENOSPC);
|
return ERR_PTR(-ENOSPC);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Even though there was no reservation in the region/reserve
|
||||||
|
* map, there could be reservations associated with the
|
||||||
|
* subpool that can be used. This would be indicated if the
|
||||||
|
* return value of hugepage_subpool_get_pages() is zero.
|
||||||
|
* However, if avoid_reserve is specified we still avoid even
|
||||||
|
* the subpool reservations.
|
||||||
|
*/
|
||||||
|
if (avoid_reserve)
|
||||||
|
gbl_chg = 1;
|
||||||
|
}
|
||||||
|
|
||||||
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
|
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
|
||||||
if (ret)
|
if (ret)
|
||||||
goto out_subpool_put;
|
goto out_subpool_put;
|
||||||
|
|
||||||
spin_lock(&hugetlb_lock);
|
spin_lock(&hugetlb_lock);
|
||||||
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
|
/*
|
||||||
|
* gbl_chg is passed to indicate whether or not a page must be taken
|
||||||
|
* from the global free pool (global change). gbl_chg == 0 indicates
|
||||||
|
* a reservation exists for the allocation.
|
||||||
|
*/
|
||||||
|
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
|
||||||
if (!page) {
|
if (!page) {
|
||||||
spin_unlock(&hugetlb_lock);
|
spin_unlock(&hugetlb_lock);
|
||||||
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
|
||||||
|
@ -1573,8 +1800,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||||
|
|
||||||
set_page_private(page, (unsigned long)spool);
|
set_page_private(page, (unsigned long)spool);
|
||||||
|
|
||||||
commit = vma_commit_reservation(h, vma, addr);
|
map_commit = vma_commit_reservation(h, vma, addr);
|
||||||
if (unlikely(chg > commit)) {
|
if (unlikely(map_chg > map_commit)) {
|
||||||
/*
|
/*
|
||||||
* The page was added to the reservation map between
|
* The page was added to the reservation map between
|
||||||
* vma_needs_reservation and vma_commit_reservation.
|
* vma_needs_reservation and vma_commit_reservation.
|
||||||
|
@ -1594,8 +1821,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
|
||||||
out_uncharge_cgroup:
|
out_uncharge_cgroup:
|
||||||
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
|
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
|
||||||
out_subpool_put:
|
out_subpool_put:
|
||||||
if (chg || avoid_reserve)
|
if (map_chg || avoid_reserve)
|
||||||
hugepage_subpool_put_pages(spool, 1);
|
hugepage_subpool_put_pages(spool, 1);
|
||||||
|
vma_end_reservation(h, vma, addr);
|
||||||
return ERR_PTR(-ENOSPC);
|
return ERR_PTR(-ENOSPC);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2311,7 +2539,7 @@ static void __exit hugetlb_exit(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
kobject_put(hugepages_kobj);
|
kobject_put(hugepages_kobj);
|
||||||
kfree(htlb_fault_mutex_table);
|
kfree(hugetlb_fault_mutex_table);
|
||||||
}
|
}
|
||||||
module_exit(hugetlb_exit);
|
module_exit(hugetlb_exit);
|
||||||
|
|
||||||
|
@ -2344,12 +2572,12 @@ static int __init hugetlb_init(void)
|
||||||
#else
|
#else
|
||||||
num_fault_mutexes = 1;
|
num_fault_mutexes = 1;
|
||||||
#endif
|
#endif
|
||||||
htlb_fault_mutex_table =
|
hugetlb_fault_mutex_table =
|
||||||
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
|
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
|
||||||
BUG_ON(!htlb_fault_mutex_table);
|
BUG_ON(!hugetlb_fault_mutex_table);
|
||||||
|
|
||||||
for (i = 0; i < num_fault_mutexes; i++)
|
for (i = 0; i < num_fault_mutexes; i++)
|
||||||
mutex_init(&htlb_fault_mutex_table[i]);
|
mutex_init(&hugetlb_fault_mutex_table[i]);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
module_init(hugetlb_init);
|
module_init(hugetlb_init);
|
||||||
|
@ -3147,6 +3375,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
|
||||||
return page != NULL;
|
return page != NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
|
||||||
|
pgoff_t idx)
|
||||||
|
{
|
||||||
|
struct inode *inode = mapping->host;
|
||||||
|
struct hstate *h = hstate_inode(inode);
|
||||||
|
int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
|
||||||
|
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
ClearPagePrivate(page);
|
||||||
|
|
||||||
|
spin_lock(&inode->i_lock);
|
||||||
|
inode->i_blocks += blocks_per_huge_page(h);
|
||||||
|
spin_unlock(&inode->i_lock);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
struct address_space *mapping, pgoff_t idx,
|
struct address_space *mapping, pgoff_t idx,
|
||||||
unsigned long address, pte_t *ptep, unsigned int flags)
|
unsigned long address, pte_t *ptep, unsigned int flags)
|
||||||
|
@ -3194,21 +3439,13 @@ retry:
|
||||||
set_page_huge_active(page);
|
set_page_huge_active(page);
|
||||||
|
|
||||||
if (vma->vm_flags & VM_MAYSHARE) {
|
if (vma->vm_flags & VM_MAYSHARE) {
|
||||||
int err;
|
int err = huge_add_to_page_cache(page, mapping, idx);
|
||||||
struct inode *inode = mapping->host;
|
|
||||||
|
|
||||||
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
|
|
||||||
if (err) {
|
if (err) {
|
||||||
put_page(page);
|
put_page(page);
|
||||||
if (err == -EEXIST)
|
if (err == -EEXIST)
|
||||||
goto retry;
|
goto retry;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
ClearPagePrivate(page);
|
|
||||||
|
|
||||||
spin_lock(&inode->i_lock);
|
|
||||||
inode->i_blocks += blocks_per_huge_page(h);
|
|
||||||
spin_unlock(&inode->i_lock);
|
|
||||||
} else {
|
} else {
|
||||||
lock_page(page);
|
lock_page(page);
|
||||||
if (unlikely(anon_vma_prepare(vma))) {
|
if (unlikely(anon_vma_prepare(vma))) {
|
||||||
|
@ -3236,11 +3473,14 @@ retry:
|
||||||
* any allocations necessary to record that reservation occur outside
|
* any allocations necessary to record that reservation occur outside
|
||||||
* the spinlock.
|
* the spinlock.
|
||||||
*/
|
*/
|
||||||
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
|
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
|
||||||
if (vma_needs_reservation(h, vma, address) < 0) {
|
if (vma_needs_reservation(h, vma, address) < 0) {
|
||||||
ret = VM_FAULT_OOM;
|
ret = VM_FAULT_OOM;
|
||||||
goto backout_unlocked;
|
goto backout_unlocked;
|
||||||
}
|
}
|
||||||
|
/* Just decrements count, does not deallocate */
|
||||||
|
vma_end_reservation(h, vma, address);
|
||||||
|
}
|
||||||
|
|
||||||
ptl = huge_pte_lockptr(h, mm, ptep);
|
ptl = huge_pte_lockptr(h, mm, ptep);
|
||||||
spin_lock(ptl);
|
spin_lock(ptl);
|
||||||
|
@ -3280,7 +3520,7 @@ backout_unlocked:
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||||
struct vm_area_struct *vma,
|
struct vm_area_struct *vma,
|
||||||
struct address_space *mapping,
|
struct address_space *mapping,
|
||||||
pgoff_t idx, unsigned long address)
|
pgoff_t idx, unsigned long address)
|
||||||
|
@ -3305,7 +3545,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||||
* For uniprocesor systems we always use a single mutex, so just
|
* For uniprocesor systems we always use a single mutex, so just
|
||||||
* return 0 and avoid the hashing overhead.
|
* return 0 and avoid the hashing overhead.
|
||||||
*/
|
*/
|
||||||
static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
|
||||||
struct vm_area_struct *vma,
|
struct vm_area_struct *vma,
|
||||||
struct address_space *mapping,
|
struct address_space *mapping,
|
||||||
pgoff_t idx, unsigned long address)
|
pgoff_t idx, unsigned long address)
|
||||||
|
@ -3353,8 +3593,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
* get spurious allocation failures if two CPUs race to instantiate
|
* get spurious allocation failures if two CPUs race to instantiate
|
||||||
* the same page in the page cache.
|
* the same page in the page cache.
|
||||||
*/
|
*/
|
||||||
hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
|
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
|
||||||
mutex_lock(&htlb_fault_mutex_table[hash]);
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||||
|
|
||||||
entry = huge_ptep_get(ptep);
|
entry = huge_ptep_get(ptep);
|
||||||
if (huge_pte_none(entry)) {
|
if (huge_pte_none(entry)) {
|
||||||
|
@ -3387,6 +3627,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
ret = VM_FAULT_OOM;
|
ret = VM_FAULT_OOM;
|
||||||
goto out_mutex;
|
goto out_mutex;
|
||||||
}
|
}
|
||||||
|
/* Just decrements count, does not deallocate */
|
||||||
|
vma_end_reservation(h, vma, address);
|
||||||
|
|
||||||
if (!(vma->vm_flags & VM_MAYSHARE))
|
if (!(vma->vm_flags & VM_MAYSHARE))
|
||||||
pagecache_page = hugetlbfs_pagecache_page(h,
|
pagecache_page = hugetlbfs_pagecache_page(h,
|
||||||
|
@ -3437,7 +3679,7 @@ out_ptl:
|
||||||
put_page(pagecache_page);
|
put_page(pagecache_page);
|
||||||
}
|
}
|
||||||
out_mutex:
|
out_mutex:
|
||||||
mutex_unlock(&htlb_fault_mutex_table[hash]);
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||||
/*
|
/*
|
||||||
* Generally it's safe to hold refcount during waiting page lock. But
|
* Generally it's safe to hold refcount during waiting page lock. But
|
||||||
* here we just wait to defer the next page fault to avoid busy loop and
|
* here we just wait to defer the next page fault to avoid busy loop and
|
||||||
|
@ -3726,12 +3968,15 @@ int hugetlb_reserve_pages(struct inode *inode,
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
out_err:
|
out_err:
|
||||||
|
if (!vma || vma->vm_flags & VM_MAYSHARE)
|
||||||
|
region_abort(resv_map, from, to);
|
||||||
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
|
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
|
||||||
kref_put(&resv_map->refs, resv_map_release);
|
kref_put(&resv_map->refs, resv_map_release);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
|
||||||
|
long freed)
|
||||||
{
|
{
|
||||||
struct hstate *h = hstate_inode(inode);
|
struct hstate *h = hstate_inode(inode);
|
||||||
struct resv_map *resv_map = inode_resv_map(inode);
|
struct resv_map *resv_map = inode_resv_map(inode);
|
||||||
|
@ -3739,8 +3984,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||||
struct hugepage_subpool *spool = subpool_inode(inode);
|
struct hugepage_subpool *spool = subpool_inode(inode);
|
||||||
long gbl_reserve;
|
long gbl_reserve;
|
||||||
|
|
||||||
if (resv_map)
|
if (resv_map) {
|
||||||
chg = region_truncate(resv_map, offset);
|
chg = region_del(resv_map, start, end);
|
||||||
|
/*
|
||||||
|
* region_del() can fail in the rare case where a region
|
||||||
|
* must be split and another region descriptor can not be
|
||||||
|
* allocated. If end == LONG_MAX, it will not fail.
|
||||||
|
*/
|
||||||
|
if (chg < 0)
|
||||||
|
return chg;
|
||||||
|
}
|
||||||
|
|
||||||
spin_lock(&inode->i_lock);
|
spin_lock(&inode->i_lock);
|
||||||
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
|
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
|
||||||
spin_unlock(&inode->i_lock);
|
spin_unlock(&inode->i_lock);
|
||||||
|
@ -3751,6 +4005,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
|
||||||
*/
|
*/
|
||||||
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
|
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
|
||||||
hugetlb_acct_memory(h, -gbl_reserve);
|
hugetlb_acct_memory(h, -gbl_reserve);
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
|
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
|
||||||
|
|
|
@ -58,7 +58,7 @@ inject:
|
||||||
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
|
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
|
||||||
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
|
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
|
||||||
put_out:
|
put_out:
|
||||||
put_page(p);
|
put_hwpoison_page(p);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -182,6 +182,7 @@ struct compact_control {
|
||||||
unsigned long nr_migratepages; /* Number of pages to migrate */
|
unsigned long nr_migratepages; /* Number of pages to migrate */
|
||||||
unsigned long free_pfn; /* isolate_freepages search base */
|
unsigned long free_pfn; /* isolate_freepages search base */
|
||||||
unsigned long migrate_pfn; /* isolate_migratepages search base */
|
unsigned long migrate_pfn; /* isolate_migratepages search base */
|
||||||
|
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
|
||||||
enum migrate_mode mode; /* Async or sync migration mode */
|
enum migrate_mode mode; /* Async or sync migration mode */
|
||||||
bool ignore_skip_hint; /* Scan blocks even if marked skip */
|
bool ignore_skip_hint; /* Scan blocks even if marked skip */
|
||||||
int order; /* order a direct compactor needs */
|
int order; /* order a direct compactor needs */
|
||||||
|
|
|
@ -838,6 +838,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crt_early_log >= ARRAY_SIZE(early_log)) {
|
if (crt_early_log >= ARRAY_SIZE(early_log)) {
|
||||||
|
crt_early_log++;
|
||||||
kmemleak_disable();
|
kmemleak_disable();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1882,7 +1883,7 @@ void __init kmemleak_init(void)
|
||||||
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
|
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
|
||||||
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
|
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
|
||||||
|
|
||||||
if (crt_early_log >= ARRAY_SIZE(early_log))
|
if (crt_early_log > ARRAY_SIZE(early_log))
|
||||||
pr_warning("Early log buffer exceeded (%d), please increase "
|
pr_warning("Early log buffer exceeded (%d), please increase "
|
||||||
"DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
|
"DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
|
||||||
|
|
||||||
|
|
|
@ -99,8 +99,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
|
||||||
struct list_lru_one *l;
|
struct list_lru_one *l;
|
||||||
|
|
||||||
spin_lock(&nlru->lock);
|
spin_lock(&nlru->lock);
|
||||||
l = list_lru_from_kmem(nlru, item);
|
|
||||||
if (list_empty(item)) {
|
if (list_empty(item)) {
|
||||||
|
l = list_lru_from_kmem(nlru, item);
|
||||||
list_add_tail(item, &l->list);
|
list_add_tail(item, &l->list);
|
||||||
l->nr_items++;
|
l->nr_items++;
|
||||||
spin_unlock(&nlru->lock);
|
spin_unlock(&nlru->lock);
|
||||||
|
@ -118,8 +118,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
|
||||||
struct list_lru_one *l;
|
struct list_lru_one *l;
|
||||||
|
|
||||||
spin_lock(&nlru->lock);
|
spin_lock(&nlru->lock);
|
||||||
l = list_lru_from_kmem(nlru, item);
|
|
||||||
if (!list_empty(item)) {
|
if (!list_empty(item)) {
|
||||||
|
l = list_lru_from_kmem(nlru, item);
|
||||||
list_del_init(item);
|
list_del_init(item);
|
||||||
l->nr_items--;
|
l->nr_items--;
|
||||||
spin_unlock(&nlru->lock);
|
spin_unlock(&nlru->lock);
|
||||||
|
|
|
@ -301,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma,
|
||||||
|
|
||||||
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
|
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
|
||||||
|
|
||||||
if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
|
if (vma->vm_flags & VM_LOCKED)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
f = vma->vm_file;
|
f = vma->vm_file;
|
||||||
|
|
|
@ -91,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
|
||||||
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
|
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
||||||
phys_addr_t base, phys_addr_t size)
|
phys_addr_t base, phys_addr_t size)
|
||||||
{
|
{
|
||||||
unsigned long i;
|
unsigned long i;
|
||||||
|
@ -103,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (i < type->cnt) ? i : -1;
|
return i < type->cnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -569,6 +569,7 @@ repeat:
|
||||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||||
WARN_ON(nid != memblock_get_region_node(rgn));
|
WARN_ON(nid != memblock_get_region_node(rgn));
|
||||||
#endif
|
#endif
|
||||||
|
WARN_ON(flags != rgn->flags);
|
||||||
nr_new++;
|
nr_new++;
|
||||||
if (insert)
|
if (insert)
|
||||||
memblock_insert_region(type, i++, base,
|
memblock_insert_region(type, i++, base,
|
||||||
|
@ -614,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
|
||||||
int nid,
|
int nid,
|
||||||
unsigned long flags)
|
unsigned long flags)
|
||||||
{
|
{
|
||||||
struct memblock_type *_rgn = &memblock.memory;
|
struct memblock_type *type = &memblock.memory;
|
||||||
|
|
||||||
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
|
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
|
||||||
(unsigned long long)base,
|
(unsigned long long)base,
|
||||||
(unsigned long long)base + size - 1,
|
(unsigned long long)base + size - 1,
|
||||||
flags, (void *)_RET_IP_);
|
flags, (void *)_RET_IP_);
|
||||||
|
|
||||||
return memblock_add_range(_rgn, base, size, nid, flags);
|
return memblock_add_range(type, base, size, nid, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
|
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
|
||||||
|
@ -761,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
|
||||||
*
|
*
|
||||||
* This function isolates region [@base, @base + @size), and sets/clears flag
|
* This function isolates region [@base, @base + @size), and sets/clears flag
|
||||||
*
|
*
|
||||||
* Return 0 on succees, -errno on failure.
|
* Return 0 on success, -errno on failure.
|
||||||
*/
|
*/
|
||||||
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
|
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
|
||||||
phys_addr_t size, int set, int flag)
|
phys_addr_t size, int set, int flag)
|
||||||
|
@ -788,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base,
|
||||||
* @base: the base phys addr of the region
|
* @base: the base phys addr of the region
|
||||||
* @size: the size of the region
|
* @size: the size of the region
|
||||||
*
|
*
|
||||||
* Return 0 on succees, -errno on failure.
|
* Return 0 on success, -errno on failure.
|
||||||
*/
|
*/
|
||||||
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
|
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
|
||||||
{
|
{
|
||||||
|
@ -800,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
|
||||||
* @base: the base phys addr of the region
|
* @base: the base phys addr of the region
|
||||||
* @size: the size of the region
|
* @size: the size of the region
|
||||||
*
|
*
|
||||||
* Return 0 on succees, -errno on failure.
|
* Return 0 on success, -errno on failure.
|
||||||
*/
|
*/
|
||||||
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
|
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
|
||||||
{
|
{
|
||||||
|
@ -812,7 +813,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
|
||||||
* @base: the base phys addr of the region
|
* @base: the base phys addr of the region
|
||||||
* @size: the size of the region
|
* @size: the size of the region
|
||||||
*
|
*
|
||||||
* Return 0 on succees, -errno on failure.
|
* Return 0 on success, -errno on failure.
|
||||||
*/
|
*/
|
||||||
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
|
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
|
||||||
{
|
{
|
||||||
|
@ -834,10 +835,10 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
|
||||||
phys_addr_t *out_start,
|
phys_addr_t *out_start,
|
||||||
phys_addr_t *out_end)
|
phys_addr_t *out_end)
|
||||||
{
|
{
|
||||||
struct memblock_type *rsv = &memblock.reserved;
|
struct memblock_type *type = &memblock.reserved;
|
||||||
|
|
||||||
if (*idx >= 0 && *idx < rsv->cnt) {
|
if (*idx >= 0 && *idx < type->cnt) {
|
||||||
struct memblock_region *r = &rsv->regions[*idx];
|
struct memblock_region *r = &type->regions[*idx];
|
||||||
phys_addr_t base = r->base;
|
phys_addr_t base = r->base;
|
||||||
phys_addr_t size = r->size;
|
phys_addr_t size = r->size;
|
||||||
|
|
||||||
|
@ -975,7 +976,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
|
||||||
* in type_b.
|
* in type_b.
|
||||||
*
|
*
|
||||||
* @idx: pointer to u64 loop variable
|
* @idx: pointer to u64 loop variable
|
||||||
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
|
* @nid: node selector, %NUMA_NO_NODE for all nodes
|
||||||
* @flags: pick from blocks based on memory attributes
|
* @flags: pick from blocks based on memory attributes
|
||||||
* @type_a: pointer to memblock_type from where the range is taken
|
* @type_a: pointer to memblock_type from where the range is taken
|
||||||
* @type_b: pointer to memblock_type which excludes memory from being taken
|
* @type_b: pointer to memblock_type which excludes memory from being taken
|
||||||
|
@ -1565,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
|
||||||
* Check if the region [@base, @base+@size) intersects a reserved memory block.
|
* Check if the region [@base, @base+@size) intersects a reserved memory block.
|
||||||
*
|
*
|
||||||
* RETURNS:
|
* RETURNS:
|
||||||
* 0 if false, non-zero if true
|
* True if they intersect, false if not.
|
||||||
*/
|
*/
|
||||||
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
|
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
|
||||||
{
|
{
|
||||||
memblock_cap_size(base, &size);
|
memblock_cap_size(base, &size);
|
||||||
return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
|
return memblock_overlaps_region(&memblock.reserved, base, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void __init_memblock memblock_trim_memory(phys_addr_t align)
|
void __init_memblock memblock_trim_memory(phys_addr_t align)
|
||||||
|
|
390
mm/memcontrol.c
390
mm/memcontrol.c
|
@ -111,56 +111,10 @@ static const char * const mem_cgroup_lru_names[] = {
|
||||||
"unevictable",
|
"unevictable",
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
* Per memcg event counter is incremented at every pagein/pageout. With THP,
|
|
||||||
* it will be incremated by the number of pages. This counter is used for
|
|
||||||
* for trigger some periodic events. This is straightforward and better
|
|
||||||
* than using jiffies etc. to handle periodic memcg event.
|
|
||||||
*/
|
|
||||||
enum mem_cgroup_events_target {
|
|
||||||
MEM_CGROUP_TARGET_THRESH,
|
|
||||||
MEM_CGROUP_TARGET_SOFTLIMIT,
|
|
||||||
MEM_CGROUP_TARGET_NUMAINFO,
|
|
||||||
MEM_CGROUP_NTARGETS,
|
|
||||||
};
|
|
||||||
#define THRESHOLDS_EVENTS_TARGET 128
|
#define THRESHOLDS_EVENTS_TARGET 128
|
||||||
#define SOFTLIMIT_EVENTS_TARGET 1024
|
#define SOFTLIMIT_EVENTS_TARGET 1024
|
||||||
#define NUMAINFO_EVENTS_TARGET 1024
|
#define NUMAINFO_EVENTS_TARGET 1024
|
||||||
|
|
||||||
struct mem_cgroup_stat_cpu {
|
|
||||||
long count[MEM_CGROUP_STAT_NSTATS];
|
|
||||||
unsigned long events[MEMCG_NR_EVENTS];
|
|
||||||
unsigned long nr_page_events;
|
|
||||||
unsigned long targets[MEM_CGROUP_NTARGETS];
|
|
||||||
};
|
|
||||||
|
|
||||||
struct reclaim_iter {
|
|
||||||
struct mem_cgroup *position;
|
|
||||||
/* scan generation, increased every round-trip */
|
|
||||||
unsigned int generation;
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
* per-zone information in memory controller.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup_per_zone {
|
|
||||||
struct lruvec lruvec;
|
|
||||||
unsigned long lru_size[NR_LRU_LISTS];
|
|
||||||
|
|
||||||
struct reclaim_iter iter[DEF_PRIORITY + 1];
|
|
||||||
|
|
||||||
struct rb_node tree_node; /* RB tree node */
|
|
||||||
unsigned long usage_in_excess;/* Set to the value by which */
|
|
||||||
/* the soft limit is exceeded*/
|
|
||||||
bool on_tree;
|
|
||||||
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
|
||||||
/* use container_of */
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mem_cgroup_per_node {
|
|
||||||
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Cgroups above their limits are maintained in a RB-Tree, independent of
|
* Cgroups above their limits are maintained in a RB-Tree, independent of
|
||||||
* their hierarchy representation
|
* their hierarchy representation
|
||||||
|
@ -181,32 +135,6 @@ struct mem_cgroup_tree {
|
||||||
|
|
||||||
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
||||||
|
|
||||||
struct mem_cgroup_threshold {
|
|
||||||
struct eventfd_ctx *eventfd;
|
|
||||||
unsigned long threshold;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* For threshold */
|
|
||||||
struct mem_cgroup_threshold_ary {
|
|
||||||
/* An array index points to threshold just below or equal to usage. */
|
|
||||||
int current_threshold;
|
|
||||||
/* Size of entries[] */
|
|
||||||
unsigned int size;
|
|
||||||
/* Array of thresholds */
|
|
||||||
struct mem_cgroup_threshold entries[0];
|
|
||||||
};
|
|
||||||
|
|
||||||
struct mem_cgroup_thresholds {
|
|
||||||
/* Primary thresholds array */
|
|
||||||
struct mem_cgroup_threshold_ary *primary;
|
|
||||||
/*
|
|
||||||
* Spare threshold array.
|
|
||||||
* This is needed to make mem_cgroup_unregister_event() "never fail".
|
|
||||||
* It must be able to store at least primary->size - 1 entries.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup_threshold_ary *spare;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* for OOM */
|
/* for OOM */
|
||||||
struct mem_cgroup_eventfd_list {
|
struct mem_cgroup_eventfd_list {
|
||||||
struct list_head list;
|
struct list_head list;
|
||||||
|
@ -256,113 +184,6 @@ struct mem_cgroup_event {
|
||||||
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
|
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
|
||||||
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
||||||
|
|
||||||
/*
|
|
||||||
* The memory controller data structure. The memory controller controls both
|
|
||||||
* page cache and RSS per cgroup. We would eventually like to provide
|
|
||||||
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
|
|
||||||
* to help the administrator determine what knobs to tune.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup {
|
|
||||||
struct cgroup_subsys_state css;
|
|
||||||
|
|
||||||
/* Accounted resources */
|
|
||||||
struct page_counter memory;
|
|
||||||
struct page_counter memsw;
|
|
||||||
struct page_counter kmem;
|
|
||||||
|
|
||||||
/* Normal memory consumption range */
|
|
||||||
unsigned long low;
|
|
||||||
unsigned long high;
|
|
||||||
|
|
||||||
unsigned long soft_limit;
|
|
||||||
|
|
||||||
/* vmpressure notifications */
|
|
||||||
struct vmpressure vmpressure;
|
|
||||||
|
|
||||||
/* css_online() has been completed */
|
|
||||||
int initialized;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Should the accounting and control be hierarchical, per subtree?
|
|
||||||
*/
|
|
||||||
bool use_hierarchy;
|
|
||||||
|
|
||||||
/* protected by memcg_oom_lock */
|
|
||||||
bool oom_lock;
|
|
||||||
int under_oom;
|
|
||||||
|
|
||||||
int swappiness;
|
|
||||||
/* OOM-Killer disable */
|
|
||||||
int oom_kill_disable;
|
|
||||||
|
|
||||||
/* protect arrays of thresholds */
|
|
||||||
struct mutex thresholds_lock;
|
|
||||||
|
|
||||||
/* thresholds for memory usage. RCU-protected */
|
|
||||||
struct mem_cgroup_thresholds thresholds;
|
|
||||||
|
|
||||||
/* thresholds for mem+swap usage. RCU-protected */
|
|
||||||
struct mem_cgroup_thresholds memsw_thresholds;
|
|
||||||
|
|
||||||
/* For oom notifier event fd */
|
|
||||||
struct list_head oom_notify;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Should we move charges of a task when a task is moved into this
|
|
||||||
* mem_cgroup ? And what type of charges should we move ?
|
|
||||||
*/
|
|
||||||
unsigned long move_charge_at_immigrate;
|
|
||||||
/*
|
|
||||||
* set > 0 if pages under this cgroup are moving to other cgroup.
|
|
||||||
*/
|
|
||||||
atomic_t moving_account;
|
|
||||||
/* taken only while moving_account > 0 */
|
|
||||||
spinlock_t move_lock;
|
|
||||||
struct task_struct *move_lock_task;
|
|
||||||
unsigned long move_lock_flags;
|
|
||||||
/*
|
|
||||||
* percpu counter.
|
|
||||||
*/
|
|
||||||
struct mem_cgroup_stat_cpu __percpu *stat;
|
|
||||||
spinlock_t pcp_counter_lock;
|
|
||||||
|
|
||||||
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
|
|
||||||
struct cg_proto tcp_mem;
|
|
||||||
#endif
|
|
||||||
#if defined(CONFIG_MEMCG_KMEM)
|
|
||||||
/* Index in the kmem_cache->memcg_params.memcg_caches array */
|
|
||||||
int kmemcg_id;
|
|
||||||
bool kmem_acct_activated;
|
|
||||||
bool kmem_acct_active;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int last_scanned_node;
|
|
||||||
#if MAX_NUMNODES > 1
|
|
||||||
nodemask_t scan_nodes;
|
|
||||||
atomic_t numainfo_events;
|
|
||||||
atomic_t numainfo_updating;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
||||||
struct list_head cgwb_list;
|
|
||||||
struct wb_domain cgwb_domain;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* List of events which userspace want to receive */
|
|
||||||
struct list_head event_list;
|
|
||||||
spinlock_t event_list_lock;
|
|
||||||
|
|
||||||
struct mem_cgroup_per_node *nodeinfo[0];
|
|
||||||
/* WARNING: nodeinfo must be the last member here */
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef CONFIG_MEMCG_KMEM
|
|
||||||
bool memcg_kmem_is_active(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
return memcg->kmem_acct_active;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Stuffs for move charges at task migration. */
|
/* Stuffs for move charges at task migration. */
|
||||||
/*
|
/*
|
||||||
* Types of charges to be moved.
|
* Types of charges to be moved.
|
||||||
|
@ -423,11 +244,6 @@ enum res_type {
|
||||||
*/
|
*/
|
||||||
static DEFINE_MUTEX(memcg_create_mutex);
|
static DEFINE_MUTEX(memcg_create_mutex);
|
||||||
|
|
||||||
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
|
|
||||||
{
|
|
||||||
return s ? container_of(s, struct mem_cgroup, css) : NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Some nice accessors for the vmpressure. */
|
/* Some nice accessors for the vmpressure. */
|
||||||
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
|
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
|
||||||
{
|
{
|
||||||
|
@ -499,8 +315,7 @@ void sock_update_memcg(struct sock *sk)
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
memcg = mem_cgroup_from_task(current);
|
memcg = mem_cgroup_from_task(current);
|
||||||
cg_proto = sk->sk_prot->proto_cgroup(memcg);
|
cg_proto = sk->sk_prot->proto_cgroup(memcg);
|
||||||
if (!mem_cgroup_is_root(memcg) &&
|
if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
|
||||||
memcg_proto_active(cg_proto) &&
|
|
||||||
css_tryget_online(&memcg->css)) {
|
css_tryget_online(&memcg->css)) {
|
||||||
sk->sk_cgrp = cg_proto;
|
sk->sk_cgrp = cg_proto;
|
||||||
}
|
}
|
||||||
|
@ -593,11 +408,6 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
|
||||||
return &memcg->nodeinfo[nid]->zoneinfo[zid];
|
return &memcg->nodeinfo[nid]->zoneinfo[zid];
|
||||||
}
|
}
|
||||||
|
|
||||||
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
return &memcg->css;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* mem_cgroup_css_from_page - css of the memcg associated with a page
|
* mem_cgroup_css_from_page - css of the memcg associated with a page
|
||||||
* @page: page of interest
|
* @page: page of interest
|
||||||
|
@ -876,14 +686,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
|
||||||
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
|
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_per_zone *mz;
|
|
||||||
|
|
||||||
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
|
||||||
return mz->lru_size[lru];
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
|
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
|
||||||
int nid,
|
int nid,
|
||||||
unsigned int lru_mask)
|
unsigned int lru_mask)
|
||||||
|
@ -986,6 +788,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
|
||||||
|
|
||||||
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
|
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
|
||||||
}
|
}
|
||||||
|
EXPORT_SYMBOL(mem_cgroup_from_task);
|
||||||
|
|
||||||
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
|
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
|
@ -1031,7 +834,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
|
||||||
struct mem_cgroup *prev,
|
struct mem_cgroup *prev,
|
||||||
struct mem_cgroup_reclaim_cookie *reclaim)
|
struct mem_cgroup_reclaim_cookie *reclaim)
|
||||||
{
|
{
|
||||||
struct reclaim_iter *uninitialized_var(iter);
|
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
|
||||||
struct cgroup_subsys_state *css = NULL;
|
struct cgroup_subsys_state *css = NULL;
|
||||||
struct mem_cgroup *memcg = NULL;
|
struct mem_cgroup *memcg = NULL;
|
||||||
struct mem_cgroup *pos = NULL;
|
struct mem_cgroup *pos = NULL;
|
||||||
|
@ -1173,30 +976,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
|
||||||
iter != NULL; \
|
iter != NULL; \
|
||||||
iter = mem_cgroup_iter(NULL, iter, NULL))
|
iter = mem_cgroup_iter(NULL, iter, NULL))
|
||||||
|
|
||||||
void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
|
|
||||||
{
|
|
||||||
struct mem_cgroup *memcg;
|
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
|
|
||||||
if (unlikely(!memcg))
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
switch (idx) {
|
|
||||||
case PGFAULT:
|
|
||||||
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
|
|
||||||
break;
|
|
||||||
case PGMAJFAULT:
|
|
||||||
this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
BUG();
|
|
||||||
}
|
|
||||||
out:
|
|
||||||
rcu_read_unlock();
|
|
||||||
}
|
|
||||||
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
|
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
|
||||||
* @zone: zone of the wanted lruvec
|
* @zone: zone of the wanted lruvec
|
||||||
|
@ -1295,15 +1074,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
|
||||||
VM_BUG_ON((long)(*lru_size) < 0);
|
VM_BUG_ON((long)(*lru_size) < 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
|
|
||||||
{
|
|
||||||
if (root == memcg)
|
|
||||||
return true;
|
|
||||||
if (!root->use_hierarchy)
|
|
||||||
return false;
|
|
||||||
return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
|
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
|
||||||
{
|
{
|
||||||
struct mem_cgroup *task_memcg;
|
struct mem_cgroup *task_memcg;
|
||||||
|
@ -1330,39 +1100,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
|
||||||
{
|
|
||||||
unsigned long inactive_ratio;
|
|
||||||
unsigned long inactive;
|
|
||||||
unsigned long active;
|
|
||||||
unsigned long gb;
|
|
||||||
|
|
||||||
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
|
|
||||||
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
|
|
||||||
|
|
||||||
gb = (inactive + active) >> (30 - PAGE_SHIFT);
|
|
||||||
if (gb)
|
|
||||||
inactive_ratio = int_sqrt(10 * gb);
|
|
||||||
else
|
|
||||||
inactive_ratio = 1;
|
|
||||||
|
|
||||||
return inactive * inactive_ratio < active;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
|
|
||||||
{
|
|
||||||
struct mem_cgroup_per_zone *mz;
|
|
||||||
struct mem_cgroup *memcg;
|
|
||||||
|
|
||||||
if (mem_cgroup_disabled())
|
|
||||||
return true;
|
|
||||||
|
|
||||||
mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
|
|
||||||
memcg = mz->memcg;
|
|
||||||
|
|
||||||
return !!(memcg->css.flags & CSS_ONLINE);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define mem_cgroup_from_counter(counter, member) \
|
#define mem_cgroup_from_counter(counter, member) \
|
||||||
container_of(counter, struct mem_cgroup, member)
|
container_of(counter, struct mem_cgroup, member)
|
||||||
|
|
||||||
|
@ -1394,15 +1131,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
|
||||||
return margin;
|
return margin;
|
||||||
}
|
}
|
||||||
|
|
||||||
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
/* root ? */
|
|
||||||
if (mem_cgroup_disabled() || !memcg->css.parent)
|
|
||||||
return vm_swappiness;
|
|
||||||
|
|
||||||
return memcg->swappiness;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* A routine for checking "mem" is under move_account() or not.
|
* A routine for checking "mem" is under move_account() or not.
|
||||||
*
|
*
|
||||||
|
@ -1545,6 +1273,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
||||||
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||||
int order)
|
int order)
|
||||||
{
|
{
|
||||||
|
struct oom_control oc = {
|
||||||
|
.zonelist = NULL,
|
||||||
|
.nodemask = NULL,
|
||||||
|
.gfp_mask = gfp_mask,
|
||||||
|
.order = order,
|
||||||
|
};
|
||||||
struct mem_cgroup *iter;
|
struct mem_cgroup *iter;
|
||||||
unsigned long chosen_points = 0;
|
unsigned long chosen_points = 0;
|
||||||
unsigned long totalpages;
|
unsigned long totalpages;
|
||||||
|
@ -1563,7 +1297,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||||
goto unlock;
|
goto unlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
|
check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
|
||||||
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
|
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
|
||||||
for_each_mem_cgroup_tree(iter, memcg) {
|
for_each_mem_cgroup_tree(iter, memcg) {
|
||||||
struct css_task_iter it;
|
struct css_task_iter it;
|
||||||
|
@ -1571,8 +1305,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||||
|
|
||||||
css_task_iter_start(&iter->css, &it);
|
css_task_iter_start(&iter->css, &it);
|
||||||
while ((task = css_task_iter_next(&it))) {
|
while ((task = css_task_iter_next(&it))) {
|
||||||
switch (oom_scan_process_thread(task, totalpages, NULL,
|
switch (oom_scan_process_thread(&oc, task, totalpages)) {
|
||||||
false)) {
|
|
||||||
case OOM_SCAN_SELECT:
|
case OOM_SCAN_SELECT:
|
||||||
if (chosen)
|
if (chosen)
|
||||||
put_task_struct(chosen);
|
put_task_struct(chosen);
|
||||||
|
@ -1610,8 +1343,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
||||||
|
|
||||||
if (chosen) {
|
if (chosen) {
|
||||||
points = chosen_points * 1000 / totalpages;
|
points = chosen_points * 1000 / totalpages;
|
||||||
oom_kill_process(chosen, gfp_mask, order, points, totalpages,
|
oom_kill_process(&oc, chosen, points, totalpages, memcg,
|
||||||
memcg, NULL, "Memory cgroup out of memory");
|
"Memory cgroup out of memory");
|
||||||
}
|
}
|
||||||
unlock:
|
unlock:
|
||||||
mutex_unlock(&oom_lock);
|
mutex_unlock(&oom_lock);
|
||||||
|
@ -2062,23 +1795,6 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
|
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
|
||||||
|
|
||||||
/**
|
|
||||||
* mem_cgroup_update_page_stat - update page state statistics
|
|
||||||
* @memcg: memcg to account against
|
|
||||||
* @idx: page state item to account
|
|
||||||
* @val: number of pages (positive or negative)
|
|
||||||
*
|
|
||||||
* See mem_cgroup_begin_page_stat() for locking requirements.
|
|
||||||
*/
|
|
||||||
void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
|
|
||||||
enum mem_cgroup_stat_index idx, int val)
|
|
||||||
{
|
|
||||||
VM_BUG_ON(!rcu_read_lock_held());
|
|
||||||
|
|
||||||
if (memcg)
|
|
||||||
this_cpu_add(memcg->stat->count[idx], val);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* size of first charge trial. "32" comes from vmscan.c's magic value.
|
* size of first charge trial. "32" comes from vmscan.c's magic value.
|
||||||
* TODO: maybe necessary to use big numbers in big irons.
|
* TODO: maybe necessary to use big numbers in big irons.
|
||||||
|
@ -2504,16 +2220,6 @@ void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
|
||||||
css_put_many(&memcg->css, nr_pages);
|
css_put_many(&memcg->css, nr_pages);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* helper for acessing a memcg's index. It will be used as an index in the
|
|
||||||
* child cache array in kmem_cache, and also to derive its name. This function
|
|
||||||
* will return -1 when this is not a kmem-limited memcg.
|
|
||||||
*/
|
|
||||||
int memcg_cache_id(struct mem_cgroup *memcg)
|
|
||||||
{
|
|
||||||
return memcg ? memcg->kmemcg_id : -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int memcg_alloc_cache_id(void)
|
static int memcg_alloc_cache_id(void)
|
||||||
{
|
{
|
||||||
int id, size;
|
int id, size;
|
||||||
|
@ -5127,10 +4833,12 @@ static void mem_cgroup_clear_mc(void)
|
||||||
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
|
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
|
||||||
struct cgroup_taskset *tset)
|
struct cgroup_taskset *tset)
|
||||||
{
|
{
|
||||||
struct task_struct *p = cgroup_taskset_first(tset);
|
|
||||||
int ret = 0;
|
|
||||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||||
|
struct mem_cgroup *from;
|
||||||
|
struct task_struct *p;
|
||||||
|
struct mm_struct *mm;
|
||||||
unsigned long move_flags;
|
unsigned long move_flags;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We are now commited to this value whatever it is. Changes in this
|
* We are now commited to this value whatever it is. Changes in this
|
||||||
|
@ -5138,36 +4846,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
|
||||||
* So we need to save it, and keep it going.
|
* So we need to save it, and keep it going.
|
||||||
*/
|
*/
|
||||||
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
|
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
|
||||||
if (move_flags) {
|
if (!move_flags)
|
||||||
struct mm_struct *mm;
|
return 0;
|
||||||
struct mem_cgroup *from = mem_cgroup_from_task(p);
|
|
||||||
|
|
||||||
VM_BUG_ON(from == memcg);
|
p = cgroup_taskset_first(tset);
|
||||||
|
from = mem_cgroup_from_task(p);
|
||||||
|
|
||||||
mm = get_task_mm(p);
|
VM_BUG_ON(from == memcg);
|
||||||
if (!mm)
|
|
||||||
return 0;
|
|
||||||
/* We move charges only when we move a owner of the mm */
|
|
||||||
if (mm->owner == p) {
|
|
||||||
VM_BUG_ON(mc.from);
|
|
||||||
VM_BUG_ON(mc.to);
|
|
||||||
VM_BUG_ON(mc.precharge);
|
|
||||||
VM_BUG_ON(mc.moved_charge);
|
|
||||||
VM_BUG_ON(mc.moved_swap);
|
|
||||||
|
|
||||||
spin_lock(&mc.lock);
|
mm = get_task_mm(p);
|
||||||
mc.from = from;
|
if (!mm)
|
||||||
mc.to = memcg;
|
return 0;
|
||||||
mc.flags = move_flags;
|
/* We move charges only when we move a owner of the mm */
|
||||||
spin_unlock(&mc.lock);
|
if (mm->owner == p) {
|
||||||
/* We set mc.moving_task later */
|
VM_BUG_ON(mc.from);
|
||||||
|
VM_BUG_ON(mc.to);
|
||||||
|
VM_BUG_ON(mc.precharge);
|
||||||
|
VM_BUG_ON(mc.moved_charge);
|
||||||
|
VM_BUG_ON(mc.moved_swap);
|
||||||
|
|
||||||
ret = mem_cgroup_precharge_mc(mm);
|
spin_lock(&mc.lock);
|
||||||
if (ret)
|
mc.from = from;
|
||||||
mem_cgroup_clear_mc();
|
mc.to = memcg;
|
||||||
}
|
mc.flags = move_flags;
|
||||||
mmput(mm);
|
spin_unlock(&mc.lock);
|
||||||
|
/* We set mc.moving_task later */
|
||||||
|
|
||||||
|
ret = mem_cgroup_precharge_mc(mm);
|
||||||
|
if (ret)
|
||||||
|
mem_cgroup_clear_mc();
|
||||||
}
|
}
|
||||||
|
mmput(mm);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5520,19 +5229,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
|
||||||
.early_init = 0,
|
.early_init = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* mem_cgroup_events - count memory events against a cgroup
|
|
||||||
* @memcg: the memory cgroup
|
|
||||||
* @idx: the event index
|
|
||||||
* @nr: the number of events to account for
|
|
||||||
*/
|
|
||||||
void mem_cgroup_events(struct mem_cgroup *memcg,
|
|
||||||
enum mem_cgroup_events_index idx,
|
|
||||||
unsigned int nr)
|
|
||||||
{
|
|
||||||
this_cpu_add(memcg->stat->events[idx], nr);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* mem_cgroup_low - check if memory consumption is below the normal range
|
* mem_cgroup_low - check if memory consumption is below the normal range
|
||||||
* @root: the highest ancestor to consider
|
* @root: the highest ancestor to consider
|
||||||
|
|
|
@ -146,7 +146,7 @@ static int hwpoison_filter_task(struct page *p)
|
||||||
if (!mem)
|
if (!mem)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
|
||||||
css = mem_cgroup_css(mem);
|
css = &mem->css;
|
||||||
ino = cgroup_ino(css->cgroup);
|
ino = cgroup_ino(css->cgroup);
|
||||||
css_put(css);
|
css_put(css);
|
||||||
|
|
||||||
|
@ -934,6 +934,27 @@ int get_hwpoison_page(struct page *page)
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(get_hwpoison_page);
|
EXPORT_SYMBOL_GPL(get_hwpoison_page);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* put_hwpoison_page() - Put refcount for memory error handling:
|
||||||
|
* @page: raw error page (hit by memory error)
|
||||||
|
*/
|
||||||
|
void put_hwpoison_page(struct page *page)
|
||||||
|
{
|
||||||
|
struct page *head = compound_head(page);
|
||||||
|
|
||||||
|
if (PageHuge(head)) {
|
||||||
|
put_page(head);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PageTransHuge(head))
|
||||||
|
if (page != head)
|
||||||
|
put_page(head);
|
||||||
|
|
||||||
|
put_page(page);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(put_hwpoison_page);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Do all that is necessary to remove user space mappings. Unmap
|
* Do all that is necessary to remove user space mappings. Unmap
|
||||||
* the pages and send SIGBUS to the processes if the data was dirty.
|
* the pages and send SIGBUS to the processes if the data was dirty.
|
||||||
|
@ -1100,7 +1121,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||||
nr_pages = 1 << compound_order(hpage);
|
nr_pages = 1 << compound_order(hpage);
|
||||||
else /* normal page or thp */
|
else /* normal page or thp */
|
||||||
nr_pages = 1;
|
nr_pages = 1;
|
||||||
atomic_long_add(nr_pages, &num_poisoned_pages);
|
num_poisoned_pages_add(nr_pages);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We need/can do nothing about count=0 pages.
|
* We need/can do nothing about count=0 pages.
|
||||||
|
@ -1128,7 +1149,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||||
if (PageHWPoison(hpage)) {
|
if (PageHWPoison(hpage)) {
|
||||||
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|
||||||
|| (p != hpage && TestSetPageHWPoison(hpage))) {
|
|| (p != hpage && TestSetPageHWPoison(hpage))) {
|
||||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
num_poisoned_pages_sub(nr_pages);
|
||||||
unlock_page(hpage);
|
unlock_page(hpage);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1152,10 +1173,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||||
else
|
else
|
||||||
pr_err("MCE: %#lx: thp split failed\n", pfn);
|
pr_err("MCE: %#lx: thp split failed\n", pfn);
|
||||||
if (TestClearPageHWPoison(p))
|
if (TestClearPageHWPoison(p))
|
||||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
num_poisoned_pages_sub(nr_pages);
|
||||||
put_page(p);
|
put_hwpoison_page(p);
|
||||||
if (p != hpage)
|
|
||||||
put_page(hpage);
|
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
VM_BUG_ON_PAGE(!page_count(p), p);
|
VM_BUG_ON_PAGE(!page_count(p), p);
|
||||||
|
@ -1214,16 +1233,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||||
*/
|
*/
|
||||||
if (!PageHWPoison(p)) {
|
if (!PageHWPoison(p)) {
|
||||||
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
|
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
|
||||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
num_poisoned_pages_sub(nr_pages);
|
||||||
unlock_page(hpage);
|
unlock_page(hpage);
|
||||||
put_page(hpage);
|
put_hwpoison_page(hpage);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (hwpoison_filter(p)) {
|
if (hwpoison_filter(p)) {
|
||||||
if (TestClearPageHWPoison(p))
|
if (TestClearPageHWPoison(p))
|
||||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
num_poisoned_pages_sub(nr_pages);
|
||||||
unlock_page(hpage);
|
unlock_page(hpage);
|
||||||
put_page(hpage);
|
put_hwpoison_page(hpage);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1237,7 +1256,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||||
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
|
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
|
||||||
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
|
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
|
||||||
unlock_page(hpage);
|
unlock_page(hpage);
|
||||||
put_page(hpage);
|
put_hwpoison_page(hpage);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
@ -1426,6 +1445,22 @@ int unpoison_memory(unsigned long pfn)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (page_count(page) > 1) {
|
||||||
|
pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (page_mapped(page)) {
|
||||||
|
pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (page_mapping(page)) {
|
||||||
|
pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
|
||||||
|
pfn);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* unpoison_memory() can encounter thp only when the thp is being
|
* unpoison_memory() can encounter thp only when the thp is being
|
||||||
* worked by memory_failure() and the page lock is not held yet.
|
* worked by memory_failure() and the page lock is not held yet.
|
||||||
|
@ -1450,7 +1485,7 @@ int unpoison_memory(unsigned long pfn)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (TestClearPageHWPoison(p))
|
if (TestClearPageHWPoison(p))
|
||||||
atomic_long_dec(&num_poisoned_pages);
|
num_poisoned_pages_dec();
|
||||||
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1464,16 +1499,16 @@ int unpoison_memory(unsigned long pfn)
|
||||||
*/
|
*/
|
||||||
if (TestClearPageHWPoison(page)) {
|
if (TestClearPageHWPoison(page)) {
|
||||||
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
|
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
|
||||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
num_poisoned_pages_sub(nr_pages);
|
||||||
freeit = 1;
|
freeit = 1;
|
||||||
if (PageHuge(page))
|
if (PageHuge(page))
|
||||||
clear_page_hwpoison_huge_page(page);
|
clear_page_hwpoison_huge_page(page);
|
||||||
}
|
}
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
|
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
|
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1486,7 +1521,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
|
||||||
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
||||||
nid);
|
nid);
|
||||||
else
|
else
|
||||||
return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1533,7 +1568,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
|
||||||
/*
|
/*
|
||||||
* Try to free it.
|
* Try to free it.
|
||||||
*/
|
*/
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
shake_page(page, 1);
|
shake_page(page, 1);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1542,7 +1577,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
|
||||||
ret = __get_any_page(page, pfn, 0);
|
ret = __get_any_page(page, pfn, 0);
|
||||||
if (!PageLRU(page)) {
|
if (!PageLRU(page)) {
|
||||||
/* Drop page reference which is from __get_any_page() */
|
/* Drop page reference which is from __get_any_page() */
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||||
pfn, page->flags);
|
pfn, page->flags);
|
||||||
return -EIO;
|
return -EIO;
|
||||||
|
@ -1565,7 +1600,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||||
lock_page(hpage);
|
lock_page(hpage);
|
||||||
if (PageHWPoison(hpage)) {
|
if (PageHWPoison(hpage)) {
|
||||||
unlock_page(hpage);
|
unlock_page(hpage);
|
||||||
put_page(hpage);
|
put_hwpoison_page(hpage);
|
||||||
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
|
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
|
@ -1576,7 +1611,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||||
* get_any_page() and isolate_huge_page() takes a refcount each,
|
* get_any_page() and isolate_huge_page() takes a refcount each,
|
||||||
* so need to drop one here.
|
* so need to drop one here.
|
||||||
*/
|
*/
|
||||||
put_page(hpage);
|
put_hwpoison_page(hpage);
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
|
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
|
@ -1600,11 +1635,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
||||||
if (PageHuge(page)) {
|
if (PageHuge(page)) {
|
||||||
set_page_hwpoison_huge_page(hpage);
|
set_page_hwpoison_huge_page(hpage);
|
||||||
dequeue_hwpoisoned_huge_page(hpage);
|
dequeue_hwpoisoned_huge_page(hpage);
|
||||||
atomic_long_add(1 << compound_order(hpage),
|
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||||
&num_poisoned_pages);
|
|
||||||
} else {
|
} else {
|
||||||
SetPageHWPoison(page);
|
SetPageHWPoison(page);
|
||||||
atomic_long_inc(&num_poisoned_pages);
|
num_poisoned_pages_inc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -1625,7 +1659,7 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||||
wait_on_page_writeback(page);
|
wait_on_page_writeback(page);
|
||||||
if (PageHWPoison(page)) {
|
if (PageHWPoison(page)) {
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
|
@ -1640,10 +1674,10 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||||
* would need to fix isolation locking first.
|
* would need to fix isolation locking first.
|
||||||
*/
|
*/
|
||||||
if (ret == 1) {
|
if (ret == 1) {
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
pr_info("soft_offline: %#lx: invalidated\n", pfn);
|
pr_info("soft_offline: %#lx: invalidated\n", pfn);
|
||||||
SetPageHWPoison(page);
|
SetPageHWPoison(page);
|
||||||
atomic_long_inc(&num_poisoned_pages);
|
num_poisoned_pages_inc();
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1657,14 +1691,12 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||||
* Drop page reference which is came from get_any_page()
|
* Drop page reference which is came from get_any_page()
|
||||||
* successful isolate_lru_page() already took another one.
|
* successful isolate_lru_page() already took another one.
|
||||||
*/
|
*/
|
||||||
put_page(page);
|
put_hwpoison_page(page);
|
||||||
if (!ret) {
|
if (!ret) {
|
||||||
LIST_HEAD(pagelist);
|
LIST_HEAD(pagelist);
|
||||||
inc_zone_page_state(page, NR_ISOLATED_ANON +
|
inc_zone_page_state(page, NR_ISOLATED_ANON +
|
||||||
page_is_file_cache(page));
|
page_is_file_cache(page));
|
||||||
list_add(&page->lru, &pagelist);
|
list_add(&page->lru, &pagelist);
|
||||||
if (!TestSetPageHWPoison(page))
|
|
||||||
atomic_long_inc(&num_poisoned_pages);
|
|
||||||
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
|
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
|
||||||
MIGRATE_SYNC, MR_MEMORY_FAILURE);
|
MIGRATE_SYNC, MR_MEMORY_FAILURE);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
|
@ -1679,8 +1711,6 @@ static int __soft_offline_page(struct page *page, int flags)
|
||||||
pfn, ret, page->flags);
|
pfn, ret, page->flags);
|
||||||
if (ret > 0)
|
if (ret > 0)
|
||||||
ret = -EIO;
|
ret = -EIO;
|
||||||
if (TestClearPageHWPoison(page))
|
|
||||||
atomic_long_dec(&num_poisoned_pages);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
|
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
|
||||||
|
@ -1719,12 +1749,16 @@ int soft_offline_page(struct page *page, int flags)
|
||||||
|
|
||||||
if (PageHWPoison(page)) {
|
if (PageHWPoison(page)) {
|
||||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||||
|
if (flags & MF_COUNT_INCREASED)
|
||||||
|
put_hwpoison_page(page);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
if (!PageHuge(page) && PageTransHuge(hpage)) {
|
if (!PageHuge(page) && PageTransHuge(hpage)) {
|
||||||
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
|
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
|
||||||
pr_info("soft offline: %#lx: failed to split THP\n",
|
pr_info("soft offline: %#lx: failed to split THP\n",
|
||||||
pfn);
|
pfn);
|
||||||
|
if (flags & MF_COUNT_INCREASED)
|
||||||
|
put_hwpoison_page(page);
|
||||||
return -EBUSY;
|
return -EBUSY;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1742,11 +1776,10 @@ int soft_offline_page(struct page *page, int flags)
|
||||||
if (PageHuge(page)) {
|
if (PageHuge(page)) {
|
||||||
set_page_hwpoison_huge_page(hpage);
|
set_page_hwpoison_huge_page(hpage);
|
||||||
if (!dequeue_hwpoisoned_huge_page(hpage))
|
if (!dequeue_hwpoisoned_huge_page(hpage))
|
||||||
atomic_long_add(1 << compound_order(hpage),
|
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||||
&num_poisoned_pages);
|
|
||||||
} else {
|
} else {
|
||||||
if (!TestSetPageHWPoison(page))
|
if (!TestSetPageHWPoison(page))
|
||||||
atomic_long_inc(&num_poisoned_pages);
|
num_poisoned_pages_inc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
|
|
48
mm/memory.c
48
mm/memory.c
|
@ -2426,8 +2426,6 @@ void unmap_mapping_range(struct address_space *mapping,
|
||||||
if (details.last_index < details.first_index)
|
if (details.last_index < details.first_index)
|
||||||
details.last_index = ULONG_MAX;
|
details.last_index = ULONG_MAX;
|
||||||
|
|
||||||
|
|
||||||
/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
|
|
||||||
i_mmap_lock_write(mapping);
|
i_mmap_lock_write(mapping);
|
||||||
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
|
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
|
||||||
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
||||||
|
@ -3015,9 +3013,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
* The fault handler has no page to lock, so it holds
|
* The fault handler has no page to lock, so it holds
|
||||||
* i_mmap_lock for read to protect against truncate.
|
* i_mmap_lock for write to protect against truncate.
|
||||||
*/
|
*/
|
||||||
i_mmap_unlock_read(vma->vm_file->f_mapping);
|
i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||||
}
|
}
|
||||||
goto uncharge_out;
|
goto uncharge_out;
|
||||||
}
|
}
|
||||||
|
@ -3031,9 +3029,9 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
* The fault handler has no page to lock, so it holds
|
* The fault handler has no page to lock, so it holds
|
||||||
* i_mmap_lock for read to protect against truncate.
|
* i_mmap_lock for write to protect against truncate.
|
||||||
*/
|
*/
|
||||||
i_mmap_unlock_read(vma->vm_file->f_mapping);
|
i_mmap_unlock_write(vma->vm_file->f_mapping);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
uncharge_out:
|
uncharge_out:
|
||||||
|
@ -3232,6 +3230,27 @@ out:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
|
unsigned long address, pmd_t *pmd, unsigned int flags)
|
||||||
|
{
|
||||||
|
if (!vma->vm_ops)
|
||||||
|
return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
|
||||||
|
if (vma->vm_ops->pmd_fault)
|
||||||
|
return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
|
unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
|
||||||
|
unsigned int flags)
|
||||||
|
{
|
||||||
|
if (!vma->vm_ops)
|
||||||
|
return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
|
||||||
|
if (vma->vm_ops->pmd_fault)
|
||||||
|
return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
|
||||||
|
return VM_FAULT_FALLBACK;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* These routines also need to handle stuff like marking pages dirty
|
* These routines also need to handle stuff like marking pages dirty
|
||||||
* and/or accessed for architectures that don't do it in hardware (most
|
* and/or accessed for architectures that don't do it in hardware (most
|
||||||
|
@ -3267,12 +3286,12 @@ static int handle_pte_fault(struct mm_struct *mm,
|
||||||
barrier();
|
barrier();
|
||||||
if (!pte_present(entry)) {
|
if (!pte_present(entry)) {
|
||||||
if (pte_none(entry)) {
|
if (pte_none(entry)) {
|
||||||
if (vma->vm_ops)
|
if (vma_is_anonymous(vma))
|
||||||
|
return do_anonymous_page(mm, vma, address,
|
||||||
|
pte, pmd, flags);
|
||||||
|
else
|
||||||
return do_fault(mm, vma, address, pte, pmd,
|
return do_fault(mm, vma, address, pte, pmd,
|
||||||
flags, entry);
|
flags, entry);
|
||||||
|
|
||||||
return do_anonymous_page(mm, vma, address, pte, pmd,
|
|
||||||
flags);
|
|
||||||
}
|
}
|
||||||
return do_swap_page(mm, vma, address,
|
return do_swap_page(mm, vma, address,
|
||||||
pte, pmd, flags, entry);
|
pte, pmd, flags, entry);
|
||||||
|
@ -3334,10 +3353,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
if (!pmd)
|
if (!pmd)
|
||||||
return VM_FAULT_OOM;
|
return VM_FAULT_OOM;
|
||||||
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
|
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
|
||||||
int ret = VM_FAULT_FALLBACK;
|
int ret = create_huge_pmd(mm, vma, address, pmd, flags);
|
||||||
if (!vma->vm_ops)
|
|
||||||
ret = do_huge_pmd_anonymous_page(mm, vma, address,
|
|
||||||
pmd, flags);
|
|
||||||
if (!(ret & VM_FAULT_FALLBACK))
|
if (!(ret & VM_FAULT_FALLBACK))
|
||||||
return ret;
|
return ret;
|
||||||
} else {
|
} else {
|
||||||
|
@ -3361,8 +3377,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
orig_pmd, pmd);
|
orig_pmd, pmd);
|
||||||
|
|
||||||
if (dirty && !pmd_write(orig_pmd)) {
|
if (dirty && !pmd_write(orig_pmd)) {
|
||||||
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
|
ret = wp_huge_pmd(mm, vma, address, pmd,
|
||||||
orig_pmd);
|
orig_pmd, flags);
|
||||||
if (!(ret & VM_FAULT_FALLBACK))
|
if (!(ret & VM_FAULT_FALLBACK))
|
||||||
return ret;
|
return ret;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
|
||||||
|
|
||||||
qp->prev = vma;
|
qp->prev = vma;
|
||||||
|
|
||||||
if (vma->vm_flags & VM_PFNMAP)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (flags & MPOL_MF_LAZY) {
|
if (flags & MPOL_MF_LAZY) {
|
||||||
/* Similar to task_numa_work, skip inaccessible VMAs */
|
/* Similar to task_numa_work, skip inaccessible VMAs */
|
||||||
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
|
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
|
||||||
|
@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
|
||||||
return alloc_huge_page_node(page_hstate(compound_head(page)),
|
return alloc_huge_page_node(page_hstate(compound_head(page)),
|
||||||
node);
|
node);
|
||||||
else
|
else
|
||||||
return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
|
return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
|
||||||
__GFP_THISNODE, 0);
|
__GFP_THISNODE, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2001,7 +1998,7 @@ retry_cpuset:
|
||||||
nmask = policy_nodemask(gfp, pol);
|
nmask = policy_nodemask(gfp, pol);
|
||||||
if (!nmask || node_isset(hpage_node, *nmask)) {
|
if (!nmask || node_isset(hpage_node, *nmask)) {
|
||||||
mpol_cond_put(pol);
|
mpol_cond_put(pol);
|
||||||
page = alloc_pages_exact_node(hpage_node,
|
page = __alloc_pages_node(hpage_node,
|
||||||
gfp | __GFP_THISNODE, order);
|
gfp | __GFP_THISNODE, order);
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
|
@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool)
|
||||||
*/
|
*/
|
||||||
void mempool_destroy(mempool_t *pool)
|
void mempool_destroy(mempool_t *pool)
|
||||||
{
|
{
|
||||||
|
if (unlikely(!pool))
|
||||||
|
return;
|
||||||
|
|
||||||
while (pool->curr_nr) {
|
while (pool->curr_nr) {
|
||||||
void *element = remove_element(pool);
|
void *element = remove_element(pool);
|
||||||
pool->free(element, pool->pool_data);
|
pool->free(element, pool->pool_data);
|
||||||
|
|
27
mm/memtest.c
27
mm/memtest.c
|
@ -1,11 +1,6 @@
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/errno.h>
|
|
||||||
#include <linux/string.h>
|
|
||||||
#include <linux/types.h>
|
#include <linux/types.h>
|
||||||
#include <linux/mm.h>
|
|
||||||
#include <linux/smp.h>
|
|
||||||
#include <linux/init.h>
|
#include <linux/init.h>
|
||||||
#include <linux/pfn.h>
|
|
||||||
#include <linux/memblock.h>
|
#include <linux/memblock.h>
|
||||||
|
|
||||||
static u64 patterns[] __initdata = {
|
static u64 patterns[] __initdata = {
|
||||||
|
@ -31,10 +26,8 @@ static u64 patterns[] __initdata = {
|
||||||
|
|
||||||
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
|
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
|
||||||
{
|
{
|
||||||
printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
|
pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
|
||||||
(unsigned long long) pattern,
|
cpu_to_be64(pattern), &start_bad, &end_bad);
|
||||||
(unsigned long long) start_bad,
|
|
||||||
(unsigned long long) end_bad);
|
|
||||||
memblock_reserve(start_bad, end_bad - start_bad);
|
memblock_reserve(start_bad, end_bad - start_bad);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,26 +72,26 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
|
||||||
this_start = clamp(this_start, start, end);
|
this_start = clamp(this_start, start, end);
|
||||||
this_end = clamp(this_end, start, end);
|
this_end = clamp(this_end, start, end);
|
||||||
if (this_start < this_end) {
|
if (this_start < this_end) {
|
||||||
printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
|
pr_info(" %pa - %pa pattern %016llx\n",
|
||||||
(unsigned long long)this_start,
|
&this_start, &this_end, cpu_to_be64(pattern));
|
||||||
(unsigned long long)this_end,
|
|
||||||
(unsigned long long)cpu_to_be64(pattern));
|
|
||||||
memtest(pattern, this_start, this_end - this_start);
|
memtest(pattern, this_start, this_end - this_start);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* default is disabled */
|
/* default is disabled */
|
||||||
static int memtest_pattern __initdata;
|
static unsigned int memtest_pattern __initdata;
|
||||||
|
|
||||||
static int __init parse_memtest(char *arg)
|
static int __init parse_memtest(char *arg)
|
||||||
{
|
{
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
if (arg)
|
if (arg)
|
||||||
memtest_pattern = simple_strtoul(arg, NULL, 0);
|
ret = kstrtouint(arg, 0, &memtest_pattern);
|
||||||
else
|
else
|
||||||
memtest_pattern = ARRAY_SIZE(patterns);
|
memtest_pattern = ARRAY_SIZE(patterns);
|
||||||
|
|
||||||
return 0;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
early_param("memtest", parse_memtest);
|
early_param("memtest", parse_memtest);
|
||||||
|
@ -111,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end)
|
||||||
if (!memtest_pattern)
|
if (!memtest_pattern)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
|
pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
|
||||||
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
|
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
|
||||||
idx = i % ARRAY_SIZE(patterns);
|
idx = i % ARRAY_SIZE(patterns);
|
||||||
do_one_pass(patterns[idx], start, end);
|
do_one_pass(patterns[idx], start, end);
|
||||||
|
|
13
mm/migrate.c
13
mm/migrate.c
|
@ -880,8 +880,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||||
/* Establish migration ptes or remove ptes */
|
/* Establish migration ptes or remove ptes */
|
||||||
if (page_mapped(page)) {
|
if (page_mapped(page)) {
|
||||||
try_to_unmap(page,
|
try_to_unmap(page,
|
||||||
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
|
TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
|
||||||
TTU_IGNORE_HWPOISON);
|
|
||||||
page_was_mapped = 1;
|
page_was_mapped = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -952,9 +951,11 @@ out:
|
||||||
dec_zone_page_state(page, NR_ISOLATED_ANON +
|
dec_zone_page_state(page, NR_ISOLATED_ANON +
|
||||||
page_is_file_cache(page));
|
page_is_file_cache(page));
|
||||||
/* Soft-offlined page shouldn't go through lru cache list */
|
/* Soft-offlined page shouldn't go through lru cache list */
|
||||||
if (reason == MR_MEMORY_FAILURE)
|
if (reason == MR_MEMORY_FAILURE) {
|
||||||
put_page(page);
|
put_page(page);
|
||||||
else
|
if (!test_set_page_hwpoison(page))
|
||||||
|
num_poisoned_pages_inc();
|
||||||
|
} else
|
||||||
putback_lru_page(page);
|
putback_lru_page(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1194,7 +1195,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
|
||||||
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
return alloc_huge_page_node(page_hstate(compound_head(p)),
|
||||||
pm->node);
|
pm->node);
|
||||||
else
|
else
|
||||||
return alloc_pages_exact_node(pm->node,
|
return __alloc_pages_node(pm->node,
|
||||||
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
|
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1554,7 +1555,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
|
||||||
int nid = (int) data;
|
int nid = (int) data;
|
||||||
struct page *newpage;
|
struct page *newpage;
|
||||||
|
|
||||||
newpage = alloc_pages_exact_node(nid,
|
newpage = __alloc_pages_node(nid,
|
||||||
(GFP_HIGHUSER_MOVABLE |
|
(GFP_HIGHUSER_MOVABLE |
|
||||||
__GFP_THISNODE | __GFP_NOMEMALLOC |
|
__GFP_THISNODE | __GFP_NOMEMALLOC |
|
||||||
__GFP_NORETRY | __GFP_NOWARN) &
|
__GFP_NORETRY | __GFP_NOWARN) &
|
||||||
|
|
71
mm/mmap.c
71
mm/mmap.c
|
@ -2455,7 +2455,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
unsigned long addr, int new_below)
|
unsigned long addr, int new_below)
|
||||||
{
|
{
|
||||||
struct vm_area_struct *new;
|
struct vm_area_struct *new;
|
||||||
int err = -ENOMEM;
|
int err;
|
||||||
|
|
||||||
if (is_vm_hugetlb_page(vma) && (addr &
|
if (is_vm_hugetlb_page(vma) && (addr &
|
||||||
~(huge_page_mask(hstate_vma(vma)))))
|
~(huge_page_mask(hstate_vma(vma)))))
|
||||||
|
@ -2463,7 +2463,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
|
|
||||||
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
||||||
if (!new)
|
if (!new)
|
||||||
goto out_err;
|
return -ENOMEM;
|
||||||
|
|
||||||
/* most fields are the same, copy all, and then fixup */
|
/* most fields are the same, copy all, and then fixup */
|
||||||
*new = *vma;
|
*new = *vma;
|
||||||
|
@ -2511,7 +2511,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
mpol_put(vma_policy(new));
|
mpol_put(vma_policy(new));
|
||||||
out_free_vma:
|
out_free_vma:
|
||||||
kmem_cache_free(vm_area_cachep, new);
|
kmem_cache_free(vm_area_cachep, new);
|
||||||
out_err:
|
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2872,6 +2871,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||||
struct vm_area_struct *prev;
|
struct vm_area_struct *prev;
|
||||||
struct rb_node **rb_link, *rb_parent;
|
struct rb_node **rb_link, *rb_parent;
|
||||||
|
|
||||||
|
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
|
||||||
|
&prev, &rb_link, &rb_parent))
|
||||||
|
return -ENOMEM;
|
||||||
|
if ((vma->vm_flags & VM_ACCOUNT) &&
|
||||||
|
security_vm_enough_memory_mm(mm, vma_pages(vma)))
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The vm_pgoff of a purely anonymous vma should be irrelevant
|
* The vm_pgoff of a purely anonymous vma should be irrelevant
|
||||||
* until its first write fault, when page's anon_vma and index
|
* until its first write fault, when page's anon_vma and index
|
||||||
|
@ -2884,16 +2890,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
|
||||||
* using the existing file pgoff checks and manipulations.
|
* using the existing file pgoff checks and manipulations.
|
||||||
* Similarly in do_mmap_pgoff and in do_brk.
|
* Similarly in do_mmap_pgoff and in do_brk.
|
||||||
*/
|
*/
|
||||||
if (!vma->vm_file) {
|
if (vma_is_anonymous(vma)) {
|
||||||
BUG_ON(vma->anon_vma);
|
BUG_ON(vma->anon_vma);
|
||||||
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
|
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
|
||||||
}
|
}
|
||||||
if (find_vma_links(mm, vma->vm_start, vma->vm_end,
|
|
||||||
&prev, &rb_link, &rb_parent))
|
|
||||||
return -ENOMEM;
|
|
||||||
if ((vma->vm_flags & VM_ACCOUNT) &&
|
|
||||||
security_vm_enough_memory_mm(mm, vma_pages(vma)))
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
vma_link(mm, vma, prev, rb_link, rb_parent);
|
vma_link(mm, vma, prev, rb_link, rb_parent);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -2918,7 +2918,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
|
||||||
* If anonymous vma has not yet been faulted, update new pgoff
|
* If anonymous vma has not yet been faulted, update new pgoff
|
||||||
* to match new location, to increase its chance of merging.
|
* to match new location, to increase its chance of merging.
|
||||||
*/
|
*/
|
||||||
if (unlikely(!vma->vm_file && !vma->anon_vma)) {
|
if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
|
||||||
pgoff = addr >> PAGE_SHIFT;
|
pgoff = addr >> PAGE_SHIFT;
|
||||||
faulted_in_anon_vma = false;
|
faulted_in_anon_vma = false;
|
||||||
}
|
}
|
||||||
|
@ -2952,30 +2952,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
|
||||||
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
|
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
|
||||||
} else {
|
} else {
|
||||||
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
|
||||||
if (new_vma) {
|
if (!new_vma)
|
||||||
*new_vma = *vma;
|
goto out;
|
||||||
new_vma->vm_start = addr;
|
*new_vma = *vma;
|
||||||
new_vma->vm_end = addr + len;
|
new_vma->vm_start = addr;
|
||||||
new_vma->vm_pgoff = pgoff;
|
new_vma->vm_end = addr + len;
|
||||||
if (vma_dup_policy(vma, new_vma))
|
new_vma->vm_pgoff = pgoff;
|
||||||
goto out_free_vma;
|
if (vma_dup_policy(vma, new_vma))
|
||||||
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
|
goto out_free_vma;
|
||||||
if (anon_vma_clone(new_vma, vma))
|
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
|
||||||
goto out_free_mempol;
|
if (anon_vma_clone(new_vma, vma))
|
||||||
if (new_vma->vm_file)
|
goto out_free_mempol;
|
||||||
get_file(new_vma->vm_file);
|
if (new_vma->vm_file)
|
||||||
if (new_vma->vm_ops && new_vma->vm_ops->open)
|
get_file(new_vma->vm_file);
|
||||||
new_vma->vm_ops->open(new_vma);
|
if (new_vma->vm_ops && new_vma->vm_ops->open)
|
||||||
vma_link(mm, new_vma, prev, rb_link, rb_parent);
|
new_vma->vm_ops->open(new_vma);
|
||||||
*need_rmap_locks = false;
|
vma_link(mm, new_vma, prev, rb_link, rb_parent);
|
||||||
}
|
*need_rmap_locks = false;
|
||||||
}
|
}
|
||||||
return new_vma;
|
return new_vma;
|
||||||
|
|
||||||
out_free_mempol:
|
out_free_mempol:
|
||||||
mpol_put(vma_policy(new_vma));
|
mpol_put(vma_policy(new_vma));
|
||||||
out_free_vma:
|
out_free_vma:
|
||||||
kmem_cache_free(vm_area_cachep, new_vma);
|
kmem_cache_free(vm_area_cachep, new_vma);
|
||||||
|
out:
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3027,21 +3028,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
|
||||||
pgoff_t pgoff;
|
pgoff_t pgoff;
|
||||||
struct page **pages;
|
struct page **pages;
|
||||||
|
|
||||||
/*
|
|
||||||
* special mappings have no vm_file, and in that case, the mm
|
|
||||||
* uses vm_pgoff internally. So we have to subtract it from here.
|
|
||||||
* We are allowed to do this because we are the mm; do not copy
|
|
||||||
* this code into drivers!
|
|
||||||
*/
|
|
||||||
pgoff = vmf->pgoff - vma->vm_pgoff;
|
|
||||||
|
|
||||||
if (vma->vm_ops == &legacy_special_mapping_vmops)
|
if (vma->vm_ops == &legacy_special_mapping_vmops)
|
||||||
pages = vma->vm_private_data;
|
pages = vma->vm_private_data;
|
||||||
else
|
else
|
||||||
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
|
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
|
||||||
pages;
|
pages;
|
||||||
|
|
||||||
for (; pgoff && *pages; ++pages)
|
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
|
||||||
pgoff--;
|
pgoff--;
|
||||||
|
|
||||||
if (*pages) {
|
if (*pages) {
|
||||||
|
|
142
mm/oom_kill.c
142
mm/oom_kill.c
|
@ -196,27 +196,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
||||||
* Determine the type of allocation constraint.
|
* Determine the type of allocation constraint.
|
||||||
*/
|
*/
|
||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||||
gfp_t gfp_mask, nodemask_t *nodemask,
|
unsigned long *totalpages)
|
||||||
unsigned long *totalpages)
|
|
||||||
{
|
{
|
||||||
struct zone *zone;
|
struct zone *zone;
|
||||||
struct zoneref *z;
|
struct zoneref *z;
|
||||||
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
|
enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
|
||||||
bool cpuset_limited = false;
|
bool cpuset_limited = false;
|
||||||
int nid;
|
int nid;
|
||||||
|
|
||||||
/* Default to all available memory */
|
/* Default to all available memory */
|
||||||
*totalpages = totalram_pages + total_swap_pages;
|
*totalpages = totalram_pages + total_swap_pages;
|
||||||
|
|
||||||
if (!zonelist)
|
if (!oc->zonelist)
|
||||||
return CONSTRAINT_NONE;
|
return CONSTRAINT_NONE;
|
||||||
/*
|
/*
|
||||||
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
|
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
|
||||||
* to kill current.We have to random task kill in this case.
|
* to kill current.We have to random task kill in this case.
|
||||||
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
|
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
|
||||||
*/
|
*/
|
||||||
if (gfp_mask & __GFP_THISNODE)
|
if (oc->gfp_mask & __GFP_THISNODE)
|
||||||
return CONSTRAINT_NONE;
|
return CONSTRAINT_NONE;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -224,17 +223,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||||
* the page allocator means a mempolicy is in effect. Cpuset policy
|
* the page allocator means a mempolicy is in effect. Cpuset policy
|
||||||
* is enforced in get_page_from_freelist().
|
* is enforced in get_page_from_freelist().
|
||||||
*/
|
*/
|
||||||
if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
|
if (oc->nodemask &&
|
||||||
|
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
|
||||||
*totalpages = total_swap_pages;
|
*totalpages = total_swap_pages;
|
||||||
for_each_node_mask(nid, *nodemask)
|
for_each_node_mask(nid, *oc->nodemask)
|
||||||
*totalpages += node_spanned_pages(nid);
|
*totalpages += node_spanned_pages(nid);
|
||||||
return CONSTRAINT_MEMORY_POLICY;
|
return CONSTRAINT_MEMORY_POLICY;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Check this allocation failure is caused by cpuset's wall function */
|
/* Check this allocation failure is caused by cpuset's wall function */
|
||||||
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
|
||||||
high_zoneidx, nodemask)
|
high_zoneidx, oc->nodemask)
|
||||||
if (!cpuset_zone_allowed(zone, gfp_mask))
|
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
|
||||||
cpuset_limited = true;
|
cpuset_limited = true;
|
||||||
|
|
||||||
if (cpuset_limited) {
|
if (cpuset_limited) {
|
||||||
|
@ -246,20 +246,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
||||||
return CONSTRAINT_NONE;
|
return CONSTRAINT_NONE;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
|
static enum oom_constraint constrained_alloc(struct oom_control *oc,
|
||||||
gfp_t gfp_mask, nodemask_t *nodemask,
|
unsigned long *totalpages)
|
||||||
unsigned long *totalpages)
|
|
||||||
{
|
{
|
||||||
*totalpages = totalram_pages + total_swap_pages;
|
*totalpages = totalram_pages + total_swap_pages;
|
||||||
return CONSTRAINT_NONE;
|
return CONSTRAINT_NONE;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
|
||||||
unsigned long totalpages, const nodemask_t *nodemask,
|
struct task_struct *task, unsigned long totalpages)
|
||||||
bool force_kill)
|
|
||||||
{
|
{
|
||||||
if (oom_unkillable_task(task, NULL, nodemask))
|
if (oom_unkillable_task(task, NULL, oc->nodemask))
|
||||||
return OOM_SCAN_CONTINUE;
|
return OOM_SCAN_CONTINUE;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -267,7 +265,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||||
* Don't allow any other task to have access to the reserves.
|
* Don't allow any other task to have access to the reserves.
|
||||||
*/
|
*/
|
||||||
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
|
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
|
||||||
if (!force_kill)
|
if (oc->order != -1)
|
||||||
return OOM_SCAN_ABORT;
|
return OOM_SCAN_ABORT;
|
||||||
}
|
}
|
||||||
if (!task->mm)
|
if (!task->mm)
|
||||||
|
@ -280,7 +278,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||||
if (oom_task_origin(task))
|
if (oom_task_origin(task))
|
||||||
return OOM_SCAN_SELECT;
|
return OOM_SCAN_SELECT;
|
||||||
|
|
||||||
if (task_will_free_mem(task) && !force_kill)
|
if (task_will_free_mem(task) && oc->order != -1)
|
||||||
return OOM_SCAN_ABORT;
|
return OOM_SCAN_ABORT;
|
||||||
|
|
||||||
return OOM_SCAN_OK;
|
return OOM_SCAN_OK;
|
||||||
|
@ -289,12 +287,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
|
||||||
/*
|
/*
|
||||||
* Simple selection loop. We chose the process with the highest
|
* Simple selection loop. We chose the process with the highest
|
||||||
* number of 'points'. Returns -1 on scan abort.
|
* number of 'points'. Returns -1 on scan abort.
|
||||||
*
|
|
||||||
* (not docbooked, we don't want this one cluttering up the manual)
|
|
||||||
*/
|
*/
|
||||||
static struct task_struct *select_bad_process(unsigned int *ppoints,
|
static struct task_struct *select_bad_process(struct oom_control *oc,
|
||||||
unsigned long totalpages, const nodemask_t *nodemask,
|
unsigned int *ppoints, unsigned long totalpages)
|
||||||
bool force_kill)
|
|
||||||
{
|
{
|
||||||
struct task_struct *g, *p;
|
struct task_struct *g, *p;
|
||||||
struct task_struct *chosen = NULL;
|
struct task_struct *chosen = NULL;
|
||||||
|
@ -304,8 +299,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||||
for_each_process_thread(g, p) {
|
for_each_process_thread(g, p) {
|
||||||
unsigned int points;
|
unsigned int points;
|
||||||
|
|
||||||
switch (oom_scan_process_thread(p, totalpages, nodemask,
|
switch (oom_scan_process_thread(oc, p, totalpages)) {
|
||||||
force_kill)) {
|
|
||||||
case OOM_SCAN_SELECT:
|
case OOM_SCAN_SELECT:
|
||||||
chosen = p;
|
chosen = p;
|
||||||
chosen_points = ULONG_MAX;
|
chosen_points = ULONG_MAX;
|
||||||
|
@ -318,7 +312,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
|
||||||
case OOM_SCAN_OK:
|
case OOM_SCAN_OK:
|
||||||
break;
|
break;
|
||||||
};
|
};
|
||||||
points = oom_badness(p, NULL, nodemask, totalpages);
|
points = oom_badness(p, NULL, oc->nodemask, totalpages);
|
||||||
if (!points || points < chosen_points)
|
if (!points || points < chosen_points)
|
||||||
continue;
|
continue;
|
||||||
/* Prefer thread group leaders for display purposes */
|
/* Prefer thread group leaders for display purposes */
|
||||||
|
@ -380,13 +374,13 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
static void dump_header(struct oom_control *oc, struct task_struct *p,
|
||||||
struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
struct mem_cgroup *memcg)
|
||||||
{
|
{
|
||||||
task_lock(current);
|
task_lock(current);
|
||||||
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
|
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
|
||||||
"oom_score_adj=%hd\n",
|
"oom_score_adj=%hd\n",
|
||||||
current->comm, gfp_mask, order,
|
current->comm, oc->gfp_mask, oc->order,
|
||||||
current->signal->oom_score_adj);
|
current->signal->oom_score_adj);
|
||||||
cpuset_print_task_mems_allowed(current);
|
cpuset_print_task_mems_allowed(current);
|
||||||
task_unlock(current);
|
task_unlock(current);
|
||||||
|
@ -396,7 +390,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||||
else
|
else
|
||||||
show_mem(SHOW_MEM_FILTER_NODES);
|
show_mem(SHOW_MEM_FILTER_NODES);
|
||||||
if (sysctl_oom_dump_tasks)
|
if (sysctl_oom_dump_tasks)
|
||||||
dump_tasks(memcg, nodemask);
|
dump_tasks(memcg, oc->nodemask);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -487,10 +481,9 @@ void oom_killer_enable(void)
|
||||||
* Must be called while holding a reference to p, which will be released upon
|
* Must be called while holding a reference to p, which will be released upon
|
||||||
* returning.
|
* returning.
|
||||||
*/
|
*/
|
||||||
void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
void oom_kill_process(struct oom_control *oc, struct task_struct *p,
|
||||||
unsigned int points, unsigned long totalpages,
|
unsigned int points, unsigned long totalpages,
|
||||||
struct mem_cgroup *memcg, nodemask_t *nodemask,
|
struct mem_cgroup *memcg, const char *message)
|
||||||
const char *message)
|
|
||||||
{
|
{
|
||||||
struct task_struct *victim = p;
|
struct task_struct *victim = p;
|
||||||
struct task_struct *child;
|
struct task_struct *child;
|
||||||
|
@ -514,7 +507,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||||
task_unlock(p);
|
task_unlock(p);
|
||||||
|
|
||||||
if (__ratelimit(&oom_rs))
|
if (__ratelimit(&oom_rs))
|
||||||
dump_header(p, gfp_mask, order, memcg, nodemask);
|
dump_header(oc, p, memcg);
|
||||||
|
|
||||||
task_lock(p);
|
task_lock(p);
|
||||||
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
||||||
|
@ -537,7 +530,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||||
/*
|
/*
|
||||||
* oom_badness() returns 0 if the thread is unkillable
|
* oom_badness() returns 0 if the thread is unkillable
|
||||||
*/
|
*/
|
||||||
child_points = oom_badness(child, memcg, nodemask,
|
child_points = oom_badness(child, memcg, oc->nodemask,
|
||||||
totalpages);
|
totalpages);
|
||||||
if (child_points > victim_points) {
|
if (child_points > victim_points) {
|
||||||
put_task_struct(victim);
|
put_task_struct(victim);
|
||||||
|
@ -600,8 +593,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
|
||||||
/*
|
/*
|
||||||
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
||||||
*/
|
*/
|
||||||
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
|
||||||
int order, const nodemask_t *nodemask,
|
|
||||||
struct mem_cgroup *memcg)
|
struct mem_cgroup *memcg)
|
||||||
{
|
{
|
||||||
if (likely(!sysctl_panic_on_oom))
|
if (likely(!sysctl_panic_on_oom))
|
||||||
|
@ -615,7 +607,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
|
||||||
if (constraint != CONSTRAINT_NONE)
|
if (constraint != CONSTRAINT_NONE)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
dump_header(NULL, gfp_mask, order, memcg, nodemask);
|
/* Do not panic for oom kills triggered by sysrq */
|
||||||
|
if (oc->order == -1)
|
||||||
|
return;
|
||||||
|
dump_header(oc, NULL, memcg);
|
||||||
panic("Out of memory: %s panic_on_oom is enabled\n",
|
panic("Out of memory: %s panic_on_oom is enabled\n",
|
||||||
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
||||||
}
|
}
|
||||||
|
@ -635,28 +630,21 @@ int unregister_oom_notifier(struct notifier_block *nb)
|
||||||
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* __out_of_memory - kill the "best" process when we run out of memory
|
* out_of_memory - kill the "best" process when we run out of memory
|
||||||
* @zonelist: zonelist pointer
|
* @oc: pointer to struct oom_control
|
||||||
* @gfp_mask: memory allocation flags
|
|
||||||
* @order: amount of memory being requested as a power of 2
|
|
||||||
* @nodemask: nodemask passed to page allocator
|
|
||||||
* @force_kill: true if a task must be killed, even if others are exiting
|
|
||||||
*
|
*
|
||||||
* If we run out of memory, we have the choice between either
|
* If we run out of memory, we have the choice between either
|
||||||
* killing a random task (bad), letting the system crash (worse)
|
* killing a random task (bad), letting the system crash (worse)
|
||||||
* OR try to be smart about which process to kill. Note that we
|
* OR try to be smart about which process to kill. Note that we
|
||||||
* don't have to be perfect here, we just have to be good.
|
* don't have to be perfect here, we just have to be good.
|
||||||
*/
|
*/
|
||||||
bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
bool out_of_memory(struct oom_control *oc)
|
||||||
int order, nodemask_t *nodemask, bool force_kill)
|
|
||||||
{
|
{
|
||||||
const nodemask_t *mpol_mask;
|
|
||||||
struct task_struct *p;
|
struct task_struct *p;
|
||||||
unsigned long totalpages;
|
unsigned long totalpages;
|
||||||
unsigned long freed = 0;
|
unsigned long freed = 0;
|
||||||
unsigned int uninitialized_var(points);
|
unsigned int uninitialized_var(points);
|
||||||
enum oom_constraint constraint = CONSTRAINT_NONE;
|
enum oom_constraint constraint = CONSTRAINT_NONE;
|
||||||
int killed = 0;
|
|
||||||
|
|
||||||
if (oom_killer_disabled)
|
if (oom_killer_disabled)
|
||||||
return false;
|
return false;
|
||||||
|
@ -664,7 +652,7 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||||
if (freed > 0)
|
if (freed > 0)
|
||||||
/* Got some memory back in the last second. */
|
/* Got some memory back in the last second. */
|
||||||
goto out;
|
return true;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If current has a pending SIGKILL or is exiting, then automatically
|
* If current has a pending SIGKILL or is exiting, then automatically
|
||||||
|
@ -677,47 +665,42 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
|
||||||
if (current->mm &&
|
if (current->mm &&
|
||||||
(fatal_signal_pending(current) || task_will_free_mem(current))) {
|
(fatal_signal_pending(current) || task_will_free_mem(current))) {
|
||||||
mark_oom_victim(current);
|
mark_oom_victim(current);
|
||||||
goto out;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check if there were limitations on the allocation (only relevant for
|
* Check if there were limitations on the allocation (only relevant for
|
||||||
* NUMA) that may require different handling.
|
* NUMA) that may require different handling.
|
||||||
*/
|
*/
|
||||||
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
|
constraint = constrained_alloc(oc, &totalpages);
|
||||||
&totalpages);
|
if (constraint != CONSTRAINT_MEMORY_POLICY)
|
||||||
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
|
oc->nodemask = NULL;
|
||||||
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
|
check_panic_on_oom(oc, constraint, NULL);
|
||||||
|
|
||||||
if (sysctl_oom_kill_allocating_task && current->mm &&
|
if (sysctl_oom_kill_allocating_task && current->mm &&
|
||||||
!oom_unkillable_task(current, NULL, nodemask) &&
|
!oom_unkillable_task(current, NULL, oc->nodemask) &&
|
||||||
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
||||||
get_task_struct(current);
|
get_task_struct(current);
|
||||||
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
|
oom_kill_process(oc, current, 0, totalpages, NULL,
|
||||||
nodemask,
|
|
||||||
"Out of memory (oom_kill_allocating_task)");
|
"Out of memory (oom_kill_allocating_task)");
|
||||||
goto out;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
|
p = select_bad_process(oc, &points, totalpages);
|
||||||
/* Found nothing?!?! Either we hang forever, or we panic. */
|
/* Found nothing?!?! Either we hang forever, or we panic. */
|
||||||
if (!p) {
|
if (!p && oc->order != -1) {
|
||||||
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
|
dump_header(oc, NULL, NULL);
|
||||||
panic("Out of memory and no killable processes...\n");
|
panic("Out of memory and no killable processes...\n");
|
||||||
}
|
}
|
||||||
if (p != (void *)-1UL) {
|
if (p && p != (void *)-1UL) {
|
||||||
oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
|
oom_kill_process(oc, p, points, totalpages, NULL,
|
||||||
nodemask, "Out of memory");
|
"Out of memory");
|
||||||
killed = 1;
|
/*
|
||||||
}
|
* Give the killed process a good chance to exit before trying
|
||||||
out:
|
* to allocate memory again.
|
||||||
/*
|
*/
|
||||||
* Give the killed threads a good chance of exiting before trying to
|
|
||||||
* allocate memory again.
|
|
||||||
*/
|
|
||||||
if (killed)
|
|
||||||
schedule_timeout_killable(1);
|
schedule_timeout_killable(1);
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -728,13 +711,20 @@ out:
|
||||||
*/
|
*/
|
||||||
void pagefault_out_of_memory(void)
|
void pagefault_out_of_memory(void)
|
||||||
{
|
{
|
||||||
|
struct oom_control oc = {
|
||||||
|
.zonelist = NULL,
|
||||||
|
.nodemask = NULL,
|
||||||
|
.gfp_mask = 0,
|
||||||
|
.order = 0,
|
||||||
|
};
|
||||||
|
|
||||||
if (mem_cgroup_oom_synchronize(true))
|
if (mem_cgroup_oom_synchronize(true))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!mutex_trylock(&oom_lock))
|
if (!mutex_trylock(&oom_lock))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!out_of_memory(NULL, 0, 0, NULL, false)) {
|
if (!out_of_memory(&oc)) {
|
||||||
/*
|
/*
|
||||||
* There shouldn't be any user tasks runnable while the
|
* There shouldn't be any user tasks runnable while the
|
||||||
* OOM killer is disabled, so the current task has to
|
* OOM killer is disabled, so the current task has to
|
||||||
|
|
|
@ -125,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
|
||||||
int percpu_pagelist_fraction;
|
int percpu_pagelist_fraction;
|
||||||
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
|
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A cached value of the page's pageblock's migratetype, used when the page is
|
||||||
|
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
|
||||||
|
* freeing from pcplists in most cases, at the cost of possibly becoming stale.
|
||||||
|
* Also the migratetype set in the page does not necessarily match the pcplist
|
||||||
|
* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
|
||||||
|
* other index - this ensures that it will be put on the correct CMA freelist.
|
||||||
|
*/
|
||||||
|
static inline int get_pcppage_migratetype(struct page *page)
|
||||||
|
{
|
||||||
|
return page->index;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
|
||||||
|
{
|
||||||
|
page->index = migratetype;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_PM_SLEEP
|
#ifdef CONFIG_PM_SLEEP
|
||||||
/*
|
/*
|
||||||
* The following functions are used by the suspend/hibernate code to temporarily
|
* The following functions are used by the suspend/hibernate code to temporarily
|
||||||
|
@ -791,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
|
||||||
page = list_entry(list->prev, struct page, lru);
|
page = list_entry(list->prev, struct page, lru);
|
||||||
/* must delete as __free_one_page list manipulates */
|
/* must delete as __free_one_page list manipulates */
|
||||||
list_del(&page->lru);
|
list_del(&page->lru);
|
||||||
mt = get_freepage_migratetype(page);
|
|
||||||
|
mt = get_pcppage_migratetype(page);
|
||||||
|
/* MIGRATE_ISOLATE page should not go to pcplists */
|
||||||
|
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
|
||||||
|
/* Pageblock could have been isolated meanwhile */
|
||||||
if (unlikely(has_isolate_pageblock(zone)))
|
if (unlikely(has_isolate_pageblock(zone)))
|
||||||
mt = get_pageblock_migratetype(page);
|
mt = get_pageblock_migratetype(page);
|
||||||
|
|
||||||
|
@ -955,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
|
||||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||||
local_irq_save(flags);
|
local_irq_save(flags);
|
||||||
__count_vm_events(PGFREE, 1 << order);
|
__count_vm_events(PGFREE, 1 << order);
|
||||||
set_freepage_migratetype(page, migratetype);
|
|
||||||
free_one_page(page_zone(page), page, pfn, order, migratetype);
|
free_one_page(page_zone(page), page, pfn, order, migratetype);
|
||||||
local_irq_restore(flags);
|
local_irq_restore(flags);
|
||||||
}
|
}
|
||||||
|
@ -1383,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
|
||||||
rmv_page_order(page);
|
rmv_page_order(page);
|
||||||
area->nr_free--;
|
area->nr_free--;
|
||||||
expand(zone, page, order, current_order, area, migratetype);
|
expand(zone, page, order, current_order, area, migratetype);
|
||||||
set_freepage_migratetype(page, migratetype);
|
set_pcppage_migratetype(page, migratetype);
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1460,7 +1481,6 @@ int move_freepages(struct zone *zone,
|
||||||
order = page_order(page);
|
order = page_order(page);
|
||||||
list_move(&page->lru,
|
list_move(&page->lru,
|
||||||
&zone->free_area[order].free_list[migratetype]);
|
&zone->free_area[order].free_list[migratetype]);
|
||||||
set_freepage_migratetype(page, migratetype);
|
|
||||||
page += 1 << order;
|
page += 1 << order;
|
||||||
pages_moved += 1 << order;
|
pages_moved += 1 << order;
|
||||||
}
|
}
|
||||||
|
@ -1630,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
||||||
expand(zone, page, order, current_order, area,
|
expand(zone, page, order, current_order, area,
|
||||||
start_migratetype);
|
start_migratetype);
|
||||||
/*
|
/*
|
||||||
* The freepage_migratetype may differ from pageblock's
|
* The pcppage_migratetype may differ from pageblock's
|
||||||
* migratetype depending on the decisions in
|
* migratetype depending on the decisions in
|
||||||
* try_to_steal_freepages(). This is OK as long as it
|
* find_suitable_fallback(). This is OK as long as it does not
|
||||||
* does not differ for MIGRATE_CMA pageblocks. For CMA
|
* differ for MIGRATE_CMA pageblocks. Those can be used as
|
||||||
* we need to make sure unallocated pages flushed from
|
* fallback only via special __rmqueue_cma_fallback() function
|
||||||
* pcp lists are returned to the correct freelist.
|
|
||||||
*/
|
*/
|
||||||
set_freepage_migratetype(page, start_migratetype);
|
set_pcppage_migratetype(page, start_migratetype);
|
||||||
|
|
||||||
trace_mm_page_alloc_extfrag(page, order, current_order,
|
trace_mm_page_alloc_extfrag(page, order, current_order,
|
||||||
start_migratetype, fallback_mt);
|
start_migratetype, fallback_mt);
|
||||||
|
@ -1713,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
||||||
else
|
else
|
||||||
list_add_tail(&page->lru, list);
|
list_add_tail(&page->lru, list);
|
||||||
list = &page->lru;
|
list = &page->lru;
|
||||||
if (is_migrate_cma(get_freepage_migratetype(page)))
|
if (is_migrate_cma(get_pcppage_migratetype(page)))
|
||||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
|
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
|
||||||
-(1 << order));
|
-(1 << order));
|
||||||
}
|
}
|
||||||
|
@ -1910,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
migratetype = get_pfnblock_migratetype(page, pfn);
|
migratetype = get_pfnblock_migratetype(page, pfn);
|
||||||
set_freepage_migratetype(page, migratetype);
|
set_pcppage_migratetype(page, migratetype);
|
||||||
local_irq_save(flags);
|
local_irq_save(flags);
|
||||||
__count_vm_event(PGFREE);
|
__count_vm_event(PGFREE);
|
||||||
|
|
||||||
|
@ -2115,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
|
||||||
if (!page)
|
if (!page)
|
||||||
goto failed;
|
goto failed;
|
||||||
__mod_zone_freepage_state(zone, -(1 << order),
|
__mod_zone_freepage_state(zone, -(1 << order),
|
||||||
get_freepage_migratetype(page));
|
get_pcppage_migratetype(page));
|
||||||
}
|
}
|
||||||
|
|
||||||
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
|
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
|
||||||
|
@ -2696,6 +2715,12 @@ static inline struct page *
|
||||||
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|
||||||
const struct alloc_context *ac, unsigned long *did_some_progress)
|
const struct alloc_context *ac, unsigned long *did_some_progress)
|
||||||
{
|
{
|
||||||
|
struct oom_control oc = {
|
||||||
|
.zonelist = ac->zonelist,
|
||||||
|
.nodemask = ac->nodemask,
|
||||||
|
.gfp_mask = gfp_mask,
|
||||||
|
.order = order,
|
||||||
|
};
|
||||||
struct page *page;
|
struct page *page;
|
||||||
|
|
||||||
*did_some_progress = 0;
|
*did_some_progress = 0;
|
||||||
|
@ -2747,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
/* Exhausted what can be done so it's blamo time */
|
/* Exhausted what can be done so it's blamo time */
|
||||||
if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
|
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
|
||||||
|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
|
|
||||||
*did_some_progress = 1;
|
*did_some_progress = 1;
|
||||||
out:
|
out:
|
||||||
mutex_unlock(&oom_lock);
|
mutex_unlock(&oom_lock);
|
||||||
|
@ -3490,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact);
|
||||||
*
|
*
|
||||||
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
|
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
|
||||||
* back.
|
* back.
|
||||||
* Note this is not alloc_pages_exact_node() which allocates on a specific node,
|
|
||||||
* but is not exact.
|
|
||||||
*/
|
*/
|
||||||
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
|
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
|
||||||
{
|
{
|
||||||
|
@ -5066,7 +5088,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
|
||||||
{
|
{
|
||||||
unsigned long zone_start_pfn, zone_end_pfn;
|
unsigned long zone_start_pfn, zone_end_pfn;
|
||||||
|
|
||||||
/* When hotadd a new node, the node should be empty */
|
/* When hotadd a new node from cpu_up(), the node should be empty */
|
||||||
if (!node_start_pfn && !node_end_pfn)
|
if (!node_start_pfn && !node_end_pfn)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
@ -5133,7 +5155,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
|
||||||
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
|
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
|
||||||
unsigned long zone_start_pfn, zone_end_pfn;
|
unsigned long zone_start_pfn, zone_end_pfn;
|
||||||
|
|
||||||
/* When hotadd a new node, the node should be empty */
|
/* When hotadd a new node from cpu_up(), the node should be empty */
|
||||||
if (!node_start_pfn && !node_end_pfn)
|
if (!node_start_pfn && !node_end_pfn)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
@ -5306,8 +5328,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
|
||||||
*
|
*
|
||||||
* NOTE: pgdat should get zeroed by caller.
|
* NOTE: pgdat should get zeroed by caller.
|
||||||
*/
|
*/
|
||||||
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
static void __paginginit free_area_init_core(struct pglist_data *pgdat)
|
||||||
unsigned long node_start_pfn, unsigned long node_end_pfn)
|
|
||||||
{
|
{
|
||||||
enum zone_type j;
|
enum zone_type j;
|
||||||
int nid = pgdat->node_id;
|
int nid = pgdat->node_id;
|
||||||
|
@ -5458,7 +5479,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||||
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
|
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
|
||||||
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
|
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
|
||||||
(u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
|
(u64)start_pfn << PAGE_SHIFT,
|
||||||
|
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
|
||||||
#endif
|
#endif
|
||||||
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
|
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
|
||||||
zones_size, zholes_size);
|
zones_size, zholes_size);
|
||||||
|
@ -5470,7 +5492,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||||
(unsigned long)pgdat->node_mem_map);
|
(unsigned long)pgdat->node_mem_map);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
free_area_init_core(pgdat, start_pfn, end_pfn);
|
free_area_init_core(pgdat);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||||
|
@ -5481,11 +5503,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
|
||||||
*/
|
*/
|
||||||
void __init setup_nr_node_ids(void)
|
void __init setup_nr_node_ids(void)
|
||||||
{
|
{
|
||||||
unsigned int node;
|
unsigned int highest;
|
||||||
unsigned int highest = 0;
|
|
||||||
|
|
||||||
for_each_node_mask(node, node_possible_map)
|
highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
|
||||||
highest = node;
|
|
||||||
nr_node_ids = highest + 1;
|
nr_node_ids = highest + 1;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -6006,7 +6026,7 @@ void __init mem_init_print_info(const char *str)
|
||||||
* set_dma_reserve - set the specified number of pages reserved in the first zone
|
* set_dma_reserve - set the specified number of pages reserved in the first zone
|
||||||
* @new_dma_reserve: The number of pages to mark reserved
|
* @new_dma_reserve: The number of pages to mark reserved
|
||||||
*
|
*
|
||||||
* The per-cpu batchsize and zone watermarks are determined by present_pages.
|
* The per-cpu batchsize and zone watermarks are determined by managed_pages.
|
||||||
* In the DMA zone, a significant percentage may be consumed by kernel image
|
* In the DMA zone, a significant percentage may be consumed by kernel image
|
||||||
* and other unfreeable allocations which can skew the watermarks badly. This
|
* and other unfreeable allocations which can skew the watermarks badly. This
|
||||||
* function may optionally be used to account for unfreeable pages in the
|
* function may optionally be used to account for unfreeable pages in the
|
||||||
|
@ -6059,7 +6079,7 @@ void __init page_alloc_init(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
|
* calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
|
||||||
* or min_free_kbytes changes.
|
* or min_free_kbytes changes.
|
||||||
*/
|
*/
|
||||||
static void calculate_totalreserve_pages(void)
|
static void calculate_totalreserve_pages(void)
|
||||||
|
@ -6103,7 +6123,7 @@ static void calculate_totalreserve_pages(void)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* setup_per_zone_lowmem_reserve - called whenever
|
* setup_per_zone_lowmem_reserve - called whenever
|
||||||
* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
|
* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
|
||||||
* has a correct pages reserved value, so an adequate number of
|
* has a correct pages reserved value, so an adequate number of
|
||||||
* pages are left in the zone after a successful __alloc_pages().
|
* pages are left in the zone after a successful __alloc_pages().
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -9,7 +9,8 @@
|
||||||
#include <linux/hugetlb.h>
|
#include <linux/hugetlb.h>
|
||||||
#include "internal.h"
|
#include "internal.h"
|
||||||
|
|
||||||
int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
|
static int set_migratetype_isolate(struct page *page,
|
||||||
|
bool skip_hwpoisoned_pages)
|
||||||
{
|
{
|
||||||
struct zone *zone;
|
struct zone *zone;
|
||||||
unsigned long flags, pfn;
|
unsigned long flags, pfn;
|
||||||
|
@ -72,7 +73,7 @@ out:
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
||||||
{
|
{
|
||||||
struct zone *zone;
|
struct zone *zone;
|
||||||
unsigned long flags, nr_pages;
|
unsigned long flags, nr_pages;
|
||||||
|
@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
page = pfn_to_page(pfn);
|
page = pfn_to_page(pfn);
|
||||||
if (PageBuddy(page)) {
|
if (PageBuddy(page))
|
||||||
/*
|
/*
|
||||||
* If race between isolatation and allocation happens,
|
* If the page is on a free list, it has to be on
|
||||||
* some free pages could be in MIGRATE_MOVABLE list
|
* the correct MIGRATE_ISOLATE freelist. There is no
|
||||||
* although pageblock's migratation type of the page
|
* simple way to verify that as VM_BUG_ON(), though.
|
||||||
* is MIGRATE_ISOLATE. Catch it and move the page into
|
|
||||||
* MIGRATE_ISOLATE list.
|
|
||||||
*/
|
*/
|
||||||
if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
|
|
||||||
struct page *end_page;
|
|
||||||
|
|
||||||
end_page = page + (1 << page_order(page)) - 1;
|
|
||||||
move_freepages(page_zone(page), page, end_page,
|
|
||||||
MIGRATE_ISOLATE);
|
|
||||||
}
|
|
||||||
pfn += 1 << page_order(page);
|
pfn += 1 << page_order(page);
|
||||||
}
|
else if (skip_hwpoisoned_pages && PageHWPoison(page))
|
||||||
else if (page_count(page) == 0 &&
|
/* A HWPoisoned page cannot be also PageBuddy */
|
||||||
get_freepage_migratetype(page) == MIGRATE_ISOLATE)
|
|
||||||
pfn += 1;
|
|
||||||
else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
|
|
||||||
/*
|
|
||||||
* The HWPoisoned page may be not in buddy
|
|
||||||
* system, and page_count() is not 0.
|
|
||||||
*/
|
|
||||||
pfn++;
|
pfn++;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
16
mm/shmem.c
16
mm/shmem.c
|
@ -542,6 +542,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(shmem_truncate_range);
|
EXPORT_SYMBOL_GPL(shmem_truncate_range);
|
||||||
|
|
||||||
|
static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||||
|
struct kstat *stat)
|
||||||
|
{
|
||||||
|
struct inode *inode = dentry->d_inode;
|
||||||
|
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||||
|
|
||||||
|
spin_lock(&info->lock);
|
||||||
|
shmem_recalc_inode(inode);
|
||||||
|
spin_unlock(&info->lock);
|
||||||
|
|
||||||
|
generic_fillattr(inode, stat);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
|
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
|
||||||
{
|
{
|
||||||
struct inode *inode = d_inode(dentry);
|
struct inode *inode = d_inode(dentry);
|
||||||
|
@ -3122,6 +3137,7 @@ static const struct file_operations shmem_file_operations = {
|
||||||
};
|
};
|
||||||
|
|
||||||
static const struct inode_operations shmem_inode_operations = {
|
static const struct inode_operations shmem_inode_operations = {
|
||||||
|
.getattr = shmem_getattr,
|
||||||
.setattr = shmem_setattr,
|
.setattr = shmem_setattr,
|
||||||
#ifdef CONFIG_TMPFS_XATTR
|
#ifdef CONFIG_TMPFS_XATTR
|
||||||
.setxattr = shmem_setxattr,
|
.setxattr = shmem_setxattr,
|
||||||
|
|
|
@ -1595,7 +1595,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
|
||||||
if (memcg_charge_slab(cachep, flags, cachep->gfporder))
|
if (memcg_charge_slab(cachep, flags, cachep->gfporder))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
|
page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
|
||||||
if (!page) {
|
if (!page) {
|
||||||
memcg_uncharge_slab(cachep, cachep->gfporder);
|
memcg_uncharge_slab(cachep, cachep->gfporder);
|
||||||
slab_out_of_memory(cachep, flags, nodeid);
|
slab_out_of_memory(cachep, flags, nodeid);
|
||||||
|
|
|
@ -500,7 +500,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
|
||||||
struct kmem_cache *root_cache)
|
struct kmem_cache *root_cache)
|
||||||
{
|
{
|
||||||
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
|
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
|
||||||
struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
|
struct cgroup_subsys_state *css = &memcg->css;
|
||||||
struct memcg_cache_array *arr;
|
struct memcg_cache_array *arr;
|
||||||
struct kmem_cache *s = NULL;
|
struct kmem_cache *s = NULL;
|
||||||
char *cache_name;
|
char *cache_name;
|
||||||
|
@ -640,6 +640,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
|
||||||
bool need_rcu_barrier = false;
|
bool need_rcu_barrier = false;
|
||||||
bool busy = false;
|
bool busy = false;
|
||||||
|
|
||||||
|
if (unlikely(!s))
|
||||||
|
return;
|
||||||
|
|
||||||
BUG_ON(!is_root_cache(s));
|
BUG_ON(!is_root_cache(s));
|
||||||
|
|
||||||
get_online_cpus();
|
get_online_cpus();
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
* NUMA support in SLOB is fairly simplistic, pushing most of the real
|
* NUMA support in SLOB is fairly simplistic, pushing most of the real
|
||||||
* logic down to the page allocator, and simply doing the node accounting
|
* logic down to the page allocator, and simply doing the node accounting
|
||||||
* on the upper levels. In the event that a node id is explicitly
|
* on the upper levels. In the event that a node id is explicitly
|
||||||
* provided, alloc_pages_exact_node() with the specified node id is used
|
* provided, __alloc_pages_node() with the specified node id is used
|
||||||
* instead. The common case (or when the node id isn't explicitly provided)
|
* instead. The common case (or when the node id isn't explicitly provided)
|
||||||
* will default to the current node, as per numa_node_id().
|
* will default to the current node, as per numa_node_id().
|
||||||
*
|
*
|
||||||
|
@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
|
||||||
|
|
||||||
#ifdef CONFIG_NUMA
|
#ifdef CONFIG_NUMA
|
||||||
if (node != NUMA_NO_NODE)
|
if (node != NUMA_NO_NODE)
|
||||||
page = alloc_pages_exact_node(node, gfp, order);
|
page = __alloc_pages_node(node, gfp, order);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
page = alloc_pages(gfp, order);
|
page = alloc_pages(gfp, order);
|
||||||
|
|
|
@ -1334,7 +1334,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
|
||||||
if (node == NUMA_NO_NODE)
|
if (node == NUMA_NO_NODE)
|
||||||
page = alloc_pages(flags, order);
|
page = alloc_pages(flags, order);
|
||||||
else
|
else
|
||||||
page = alloc_pages_exact_node(node, flags, order);
|
page = __alloc_pages_node(node, flags, order);
|
||||||
|
|
||||||
if (!page)
|
if (!page)
|
||||||
memcg_uncharge_slab(s, order);
|
memcg_uncharge_slab(s, order);
|
||||||
|
|
|
@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry)
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||||
* Locate a page of swap in physical memory, reserving swap cache space
|
struct vm_area_struct *vma, unsigned long addr,
|
||||||
* and reading the disk if it is not already cached.
|
bool *new_page_allocated)
|
||||||
* A failure return means that either the page allocation failed or that
|
|
||||||
* the swap entry is no longer in use.
|
|
||||||
*/
|
|
||||||
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
|
||||||
struct vm_area_struct *vma, unsigned long addr)
|
|
||||||
{
|
{
|
||||||
struct page *found_page, *new_page = NULL;
|
struct page *found_page, *new_page = NULL;
|
||||||
|
struct address_space *swapper_space = swap_address_space(entry);
|
||||||
int err;
|
int err;
|
||||||
|
*new_page_allocated = false;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
/*
|
/*
|
||||||
|
@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||||
* called after lookup_swap_cache() failed, re-calling
|
* called after lookup_swap_cache() failed, re-calling
|
||||||
* that would confuse statistics.
|
* that would confuse statistics.
|
||||||
*/
|
*/
|
||||||
found_page = find_get_page(swap_address_space(entry),
|
found_page = find_get_page(swapper_space, entry.val);
|
||||||
entry.val);
|
|
||||||
if (found_page)
|
if (found_page)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||||
* Initiate read into locked page and return.
|
* Initiate read into locked page and return.
|
||||||
*/
|
*/
|
||||||
lru_cache_add_anon(new_page);
|
lru_cache_add_anon(new_page);
|
||||||
swap_readpage(new_page);
|
*new_page_allocated = true;
|
||||||
return new_page;
|
return new_page;
|
||||||
}
|
}
|
||||||
radix_tree_preload_end();
|
radix_tree_preload_end();
|
||||||
|
@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||||
return found_page;
|
return found_page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Locate a page of swap in physical memory, reserving swap cache space
|
||||||
|
* and reading the disk if it is not already cached.
|
||||||
|
* A failure return means that either the page allocation failed or that
|
||||||
|
* the swap entry is no longer in use.
|
||||||
|
*/
|
||||||
|
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||||
|
struct vm_area_struct *vma, unsigned long addr)
|
||||||
|
{
|
||||||
|
bool page_was_allocated;
|
||||||
|
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
|
||||||
|
vma, addr, &page_was_allocated);
|
||||||
|
|
||||||
|
if (page_was_allocated)
|
||||||
|
swap_readpage(retpage);
|
||||||
|
|
||||||
|
return retpage;
|
||||||
|
}
|
||||||
|
|
||||||
static unsigned long swapin_nr_pages(unsigned long offset)
|
static unsigned long swapin_nr_pages(unsigned long offset)
|
||||||
{
|
{
|
||||||
static unsigned long prev_offset;
|
static unsigned long prev_offset;
|
||||||
|
|
|
@ -874,6 +874,48 @@ int page_swapcount(struct page *page)
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* How many references to @entry are currently swapped out?
|
||||||
|
* This considers COUNT_CONTINUED so it returns exact answer.
|
||||||
|
*/
|
||||||
|
int swp_swapcount(swp_entry_t entry)
|
||||||
|
{
|
||||||
|
int count, tmp_count, n;
|
||||||
|
struct swap_info_struct *p;
|
||||||
|
struct page *page;
|
||||||
|
pgoff_t offset;
|
||||||
|
unsigned char *map;
|
||||||
|
|
||||||
|
p = swap_info_get(entry);
|
||||||
|
if (!p)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
count = swap_count(p->swap_map[swp_offset(entry)]);
|
||||||
|
if (!(count & COUNT_CONTINUED))
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
count &= ~COUNT_CONTINUED;
|
||||||
|
n = SWAP_MAP_MAX + 1;
|
||||||
|
|
||||||
|
offset = swp_offset(entry);
|
||||||
|
page = vmalloc_to_page(p->swap_map + offset);
|
||||||
|
offset &= ~PAGE_MASK;
|
||||||
|
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
|
||||||
|
|
||||||
|
do {
|
||||||
|
page = list_entry(page->lru.next, struct page, lru);
|
||||||
|
map = kmap_atomic(page);
|
||||||
|
tmp_count = map[offset];
|
||||||
|
kunmap_atomic(map);
|
||||||
|
|
||||||
|
count += (tmp_count & ~COUNT_CONTINUED) * n;
|
||||||
|
n *= (SWAP_CONT_MAX + 1);
|
||||||
|
} while (tmp_count & COUNT_CONTINUED);
|
||||||
|
out:
|
||||||
|
spin_unlock(&p->lock);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We can write to an anon page without COW if there are no other references
|
* We can write to an anon page without COW if there are no other references
|
||||||
* to it. And as a side-effect, free up its swap: because the old content
|
* to it. And as a side-effect, free up its swap: because the old content
|
||||||
|
|
14
mm/vmscan.c
14
mm/vmscan.c
|
@ -175,7 +175,7 @@ static bool sane_reclaim(struct scan_control *sc)
|
||||||
if (!memcg)
|
if (!memcg)
|
||||||
return true;
|
return true;
|
||||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||||
if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
|
if (memcg->css.cgroup)
|
||||||
return true;
|
return true;
|
||||||
#endif
|
#endif
|
||||||
return false;
|
return false;
|
||||||
|
@ -985,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
|
||||||
* __GFP_IO|__GFP_FS for this reason); but more thought
|
* __GFP_IO|__GFP_FS for this reason); but more thought
|
||||||
* would probably show more reasons.
|
* would probably show more reasons.
|
||||||
*
|
*
|
||||||
* 3) Legacy memcg encounters a page that is not already marked
|
* 3) Legacy memcg encounters a page that is already marked
|
||||||
* PageReclaim. memcg does not have any dirty pages
|
* PageReclaim. memcg does not have any dirty pages
|
||||||
* throttling so we could easily OOM just because too many
|
* throttling so we could easily OOM just because too many
|
||||||
* pages are in writeback and there is nothing else to
|
* pages are in writeback and there is nothing else to
|
||||||
|
@ -1015,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
|
||||||
*/
|
*/
|
||||||
SetPageReclaim(page);
|
SetPageReclaim(page);
|
||||||
nr_writeback++;
|
nr_writeback++;
|
||||||
|
|
||||||
goto keep_locked;
|
goto keep_locked;
|
||||||
|
|
||||||
/* Case 3 above */
|
/* Case 3 above */
|
||||||
} else {
|
} else {
|
||||||
|
unlock_page(page);
|
||||||
wait_on_page_writeback(page);
|
wait_on_page_writeback(page);
|
||||||
|
/* then go back and try same page again */
|
||||||
|
list_add_tail(&page->lru, page_list);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1196,7 +1199,7 @@ cull_mlocked:
|
||||||
if (PageSwapCache(page))
|
if (PageSwapCache(page))
|
||||||
try_to_free_swap(page);
|
try_to_free_swap(page);
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
putback_lru_page(page);
|
list_add(&page->lru, &ret_pages);
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
activate_locked:
|
activate_locked:
|
||||||
|
@ -1359,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
|
||||||
unsigned long nr_taken = 0;
|
unsigned long nr_taken = 0;
|
||||||
unsigned long scan;
|
unsigned long scan;
|
||||||
|
|
||||||
for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
|
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
|
||||||
|
!list_empty(src); scan++) {
|
||||||
struct page *page;
|
struct page *page;
|
||||||
int nr_pages;
|
int nr_pages;
|
||||||
|
|
||||||
|
|
10
mm/zbud.c
10
mm/zbud.c
|
@ -96,10 +96,10 @@ struct zbud_pool {
|
||||||
struct list_head buddied;
|
struct list_head buddied;
|
||||||
struct list_head lru;
|
struct list_head lru;
|
||||||
u64 pages_nr;
|
u64 pages_nr;
|
||||||
struct zbud_ops *ops;
|
const struct zbud_ops *ops;
|
||||||
#ifdef CONFIG_ZPOOL
|
#ifdef CONFIG_ZPOOL
|
||||||
struct zpool *zpool;
|
struct zpool *zpool;
|
||||||
struct zpool_ops *zpool_ops;
|
const struct zpool_ops *zpool_ops;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -133,12 +133,12 @@ static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
|
||||||
return -ENOENT;
|
return -ENOENT;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct zbud_ops zbud_zpool_ops = {
|
static const struct zbud_ops zbud_zpool_ops = {
|
||||||
.evict = zbud_zpool_evict
|
.evict = zbud_zpool_evict
|
||||||
};
|
};
|
||||||
|
|
||||||
static void *zbud_zpool_create(char *name, gfp_t gfp,
|
static void *zbud_zpool_create(char *name, gfp_t gfp,
|
||||||
struct zpool_ops *zpool_ops,
|
const struct zpool_ops *zpool_ops,
|
||||||
struct zpool *zpool)
|
struct zpool *zpool)
|
||||||
{
|
{
|
||||||
struct zbud_pool *pool;
|
struct zbud_pool *pool;
|
||||||
|
@ -302,7 +302,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
|
||||||
* Return: pointer to the new zbud pool or NULL if the metadata allocation
|
* Return: pointer to the new zbud pool or NULL if the metadata allocation
|
||||||
* failed.
|
* failed.
|
||||||
*/
|
*/
|
||||||
struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
|
struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
|
||||||
{
|
{
|
||||||
struct zbud_pool *pool;
|
struct zbud_pool *pool;
|
||||||
int i;
|
int i;
|
||||||
|
|
18
mm/zpool.c
18
mm/zpool.c
|
@ -22,7 +22,7 @@ struct zpool {
|
||||||
|
|
||||||
struct zpool_driver *driver;
|
struct zpool_driver *driver;
|
||||||
void *pool;
|
void *pool;
|
||||||
struct zpool_ops *ops;
|
const struct zpool_ops *ops;
|
||||||
|
|
||||||
struct list_head list;
|
struct list_head list;
|
||||||
};
|
};
|
||||||
|
@ -115,7 +115,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
|
||||||
* Returns: New zpool on success, NULL on failure.
|
* Returns: New zpool on success, NULL on failure.
|
||||||
*/
|
*/
|
||||||
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
|
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
|
||||||
struct zpool_ops *ops)
|
const struct zpool_ops *ops)
|
||||||
{
|
{
|
||||||
struct zpool_driver *driver;
|
struct zpool_driver *driver;
|
||||||
struct zpool *zpool;
|
struct zpool *zpool;
|
||||||
|
@ -320,20 +320,6 @@ u64 zpool_get_total_size(struct zpool *zpool)
|
||||||
return zpool->driver->total_size(zpool->pool);
|
return zpool->driver->total_size(zpool->pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int __init init_zpool(void)
|
|
||||||
{
|
|
||||||
pr_info("loaded\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __exit exit_zpool(void)
|
|
||||||
{
|
|
||||||
pr_info("unloaded\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
module_init(init_zpool);
|
|
||||||
module_exit(exit_zpool);
|
|
||||||
|
|
||||||
MODULE_LICENSE("GPL");
|
MODULE_LICENSE("GPL");
|
||||||
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
|
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
|
||||||
MODULE_DESCRIPTION("Common API for compressed memory storage");
|
MODULE_DESCRIPTION("Common API for compressed memory storage");
|
||||||
|
|
235
mm/zsmalloc.c
235
mm/zsmalloc.c
|
@ -169,14 +169,12 @@ enum zs_stat_type {
|
||||||
NR_ZS_STAT_TYPE,
|
NR_ZS_STAT_TYPE,
|
||||||
};
|
};
|
||||||
|
|
||||||
#ifdef CONFIG_ZSMALLOC_STAT
|
|
||||||
|
|
||||||
static struct dentry *zs_stat_root;
|
|
||||||
|
|
||||||
struct zs_size_stat {
|
struct zs_size_stat {
|
||||||
unsigned long objs[NR_ZS_STAT_TYPE];
|
unsigned long objs[NR_ZS_STAT_TYPE];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef CONFIG_ZSMALLOC_STAT
|
||||||
|
static struct dentry *zs_stat_root;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -201,6 +199,8 @@ static int zs_size_classes;
|
||||||
static const int fullness_threshold_frac = 4;
|
static const int fullness_threshold_frac = 4;
|
||||||
|
|
||||||
struct size_class {
|
struct size_class {
|
||||||
|
spinlock_t lock;
|
||||||
|
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
|
||||||
/*
|
/*
|
||||||
* Size of objects stored in this class. Must be multiple
|
* Size of objects stored in this class. Must be multiple
|
||||||
* of ZS_ALIGN.
|
* of ZS_ALIGN.
|
||||||
|
@ -210,16 +210,10 @@ struct size_class {
|
||||||
|
|
||||||
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
|
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
|
||||||
int pages_per_zspage;
|
int pages_per_zspage;
|
||||||
|
struct zs_size_stat stats;
|
||||||
|
|
||||||
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
|
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
|
||||||
bool huge;
|
bool huge;
|
||||||
|
|
||||||
#ifdef CONFIG_ZSMALLOC_STAT
|
|
||||||
struct zs_size_stat stats;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
spinlock_t lock;
|
|
||||||
|
|
||||||
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -251,6 +245,15 @@ struct zs_pool {
|
||||||
gfp_t flags; /* allocation flags used when growing pool */
|
gfp_t flags; /* allocation flags used when growing pool */
|
||||||
atomic_long_t pages_allocated;
|
atomic_long_t pages_allocated;
|
||||||
|
|
||||||
|
struct zs_pool_stats stats;
|
||||||
|
|
||||||
|
/* Compact classes */
|
||||||
|
struct shrinker shrinker;
|
||||||
|
/*
|
||||||
|
* To signify that register_shrinker() was successful
|
||||||
|
* and unregister_shrinker() will not Oops.
|
||||||
|
*/
|
||||||
|
bool shrinker_enabled;
|
||||||
#ifdef CONFIG_ZSMALLOC_STAT
|
#ifdef CONFIG_ZSMALLOC_STAT
|
||||||
struct dentry *stat_dentry;
|
struct dentry *stat_dentry;
|
||||||
#endif
|
#endif
|
||||||
|
@ -285,8 +288,7 @@ static int create_handle_cache(struct zs_pool *pool)
|
||||||
|
|
||||||
static void destroy_handle_cache(struct zs_pool *pool)
|
static void destroy_handle_cache(struct zs_pool *pool)
|
||||||
{
|
{
|
||||||
if (pool->handle_cachep)
|
kmem_cache_destroy(pool->handle_cachep);
|
||||||
kmem_cache_destroy(pool->handle_cachep);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long alloc_handle(struct zs_pool *pool)
|
static unsigned long alloc_handle(struct zs_pool *pool)
|
||||||
|
@ -309,7 +311,8 @@ static void record_obj(unsigned long handle, unsigned long obj)
|
||||||
|
|
||||||
#ifdef CONFIG_ZPOOL
|
#ifdef CONFIG_ZPOOL
|
||||||
|
|
||||||
static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
|
static void *zs_zpool_create(char *name, gfp_t gfp,
|
||||||
|
const struct zpool_ops *zpool_ops,
|
||||||
struct zpool *zpool)
|
struct zpool *zpool)
|
||||||
{
|
{
|
||||||
return zs_create_pool(name, gfp);
|
return zs_create_pool(name, gfp);
|
||||||
|
@ -441,8 +444,6 @@ static int get_size_class_index(int size)
|
||||||
return min(zs_size_classes - 1, idx);
|
return min(zs_size_classes - 1, idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_ZSMALLOC_STAT
|
|
||||||
|
|
||||||
static inline void zs_stat_inc(struct size_class *class,
|
static inline void zs_stat_inc(struct size_class *class,
|
||||||
enum zs_stat_type type, unsigned long cnt)
|
enum zs_stat_type type, unsigned long cnt)
|
||||||
{
|
{
|
||||||
|
@ -461,6 +462,8 @@ static inline unsigned long zs_stat_get(struct size_class *class,
|
||||||
return class->stats.objs[type];
|
return class->stats.objs[type];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_ZSMALLOC_STAT
|
||||||
|
|
||||||
static int __init zs_stat_init(void)
|
static int __init zs_stat_init(void)
|
||||||
{
|
{
|
||||||
if (!debugfs_initialized())
|
if (!debugfs_initialized())
|
||||||
|
@ -576,23 +579,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* CONFIG_ZSMALLOC_STAT */
|
#else /* CONFIG_ZSMALLOC_STAT */
|
||||||
|
|
||||||
static inline void zs_stat_inc(struct size_class *class,
|
|
||||||
enum zs_stat_type type, unsigned long cnt)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void zs_stat_dec(struct size_class *class,
|
|
||||||
enum zs_stat_type type, unsigned long cnt)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline unsigned long zs_stat_get(struct size_class *class,
|
|
||||||
enum zs_stat_type type)
|
|
||||||
{
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int __init zs_stat_init(void)
|
static int __init zs_stat_init(void)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -610,7 +596,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
|
||||||
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
|
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -658,13 +643,22 @@ static void insert_zspage(struct page *page, struct size_class *class,
|
||||||
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
|
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
head = &class->fullness_list[fullness];
|
|
||||||
if (*head)
|
|
||||||
list_add_tail(&page->lru, &(*head)->lru);
|
|
||||||
|
|
||||||
*head = page;
|
|
||||||
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
|
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
|
||||||
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
|
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
|
||||||
|
|
||||||
|
head = &class->fullness_list[fullness];
|
||||||
|
if (!*head) {
|
||||||
|
*head = page;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We want to see more ZS_FULL pages and less almost
|
||||||
|
* empty/full. Put pages with higher ->inuse first.
|
||||||
|
*/
|
||||||
|
list_add_tail(&page->lru, &(*head)->lru);
|
||||||
|
if (page->inuse >= (*head)->inuse)
|
||||||
|
*head = page;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1495,7 +1489,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(zs_free);
|
EXPORT_SYMBOL_GPL(zs_free);
|
||||||
|
|
||||||
static void zs_object_copy(unsigned long src, unsigned long dst,
|
static void zs_object_copy(unsigned long dst, unsigned long src,
|
||||||
struct size_class *class)
|
struct size_class *class)
|
||||||
{
|
{
|
||||||
struct page *s_page, *d_page;
|
struct page *s_page, *d_page;
|
||||||
|
@ -1602,8 +1596,6 @@ struct zs_compact_control {
|
||||||
/* Starting object index within @s_page which used for live object
|
/* Starting object index within @s_page which used for live object
|
||||||
* in the subpage. */
|
* in the subpage. */
|
||||||
int index;
|
int index;
|
||||||
/* how many of objects are migrated */
|
|
||||||
int nr_migrated;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
||||||
|
@ -1614,7 +1606,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
||||||
struct page *s_page = cc->s_page;
|
struct page *s_page = cc->s_page;
|
||||||
struct page *d_page = cc->d_page;
|
struct page *d_page = cc->d_page;
|
||||||
unsigned long index = cc->index;
|
unsigned long index = cc->index;
|
||||||
int nr_migrated = 0;
|
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
|
@ -1636,23 +1627,21 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
|
||||||
|
|
||||||
used_obj = handle_to_obj(handle);
|
used_obj = handle_to_obj(handle);
|
||||||
free_obj = obj_malloc(d_page, class, handle);
|
free_obj = obj_malloc(d_page, class, handle);
|
||||||
zs_object_copy(used_obj, free_obj, class);
|
zs_object_copy(free_obj, used_obj, class);
|
||||||
index++;
|
index++;
|
||||||
record_obj(handle, free_obj);
|
record_obj(handle, free_obj);
|
||||||
unpin_tag(handle);
|
unpin_tag(handle);
|
||||||
obj_free(pool, class, used_obj);
|
obj_free(pool, class, used_obj);
|
||||||
nr_migrated++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Remember last position in this iteration */
|
/* Remember last position in this iteration */
|
||||||
cc->s_page = s_page;
|
cc->s_page = s_page;
|
||||||
cc->index = index;
|
cc->index = index;
|
||||||
cc->nr_migrated = nr_migrated;
|
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct page *alloc_target_page(struct size_class *class)
|
static struct page *isolate_target_page(struct size_class *class)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
struct page *page;
|
struct page *page;
|
||||||
|
@ -1668,8 +1657,17 @@ static struct page *alloc_target_page(struct size_class *class)
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void putback_zspage(struct zs_pool *pool, struct size_class *class,
|
/*
|
||||||
struct page *first_page)
|
* putback_zspage - add @first_page into right class's fullness list
|
||||||
|
* @pool: target pool
|
||||||
|
* @class: destination class
|
||||||
|
* @first_page: target page
|
||||||
|
*
|
||||||
|
* Return @fist_page's fullness_group
|
||||||
|
*/
|
||||||
|
static enum fullness_group putback_zspage(struct zs_pool *pool,
|
||||||
|
struct size_class *class,
|
||||||
|
struct page *first_page)
|
||||||
{
|
{
|
||||||
enum fullness_group fullness;
|
enum fullness_group fullness;
|
||||||
|
|
||||||
|
@ -1687,50 +1685,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class,
|
||||||
|
|
||||||
free_zspage(first_page);
|
free_zspage(first_page);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return fullness;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct page *isolate_source_page(struct size_class *class)
|
static struct page *isolate_source_page(struct size_class *class)
|
||||||
{
|
{
|
||||||
struct page *page;
|
int i;
|
||||||
|
struct page *page = NULL;
|
||||||
|
|
||||||
page = class->fullness_list[ZS_ALMOST_EMPTY];
|
for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
|
||||||
if (page)
|
page = class->fullness_list[i];
|
||||||
remove_zspage(page, class, ZS_ALMOST_EMPTY);
|
if (!page)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
remove_zspage(page, class, i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
static unsigned long __zs_compact(struct zs_pool *pool,
|
/*
|
||||||
struct size_class *class)
|
*
|
||||||
|
* Based on the number of unused allocated objects calculate
|
||||||
|
* and return the number of pages that we can free.
|
||||||
|
*/
|
||||||
|
static unsigned long zs_can_compact(struct size_class *class)
|
||||||
|
{
|
||||||
|
unsigned long obj_wasted;
|
||||||
|
|
||||||
|
obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
|
||||||
|
zs_stat_get(class, OBJ_USED);
|
||||||
|
|
||||||
|
obj_wasted /= get_maxobj_per_zspage(class->size,
|
||||||
|
class->pages_per_zspage);
|
||||||
|
|
||||||
|
return obj_wasted * class->pages_per_zspage;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __zs_compact(struct zs_pool *pool, struct size_class *class)
|
||||||
{
|
{
|
||||||
int nr_to_migrate;
|
|
||||||
struct zs_compact_control cc;
|
struct zs_compact_control cc;
|
||||||
struct page *src_page;
|
struct page *src_page;
|
||||||
struct page *dst_page = NULL;
|
struct page *dst_page = NULL;
|
||||||
unsigned long nr_total_migrated = 0;
|
|
||||||
|
|
||||||
spin_lock(&class->lock);
|
spin_lock(&class->lock);
|
||||||
while ((src_page = isolate_source_page(class))) {
|
while ((src_page = isolate_source_page(class))) {
|
||||||
|
|
||||||
BUG_ON(!is_first_page(src_page));
|
BUG_ON(!is_first_page(src_page));
|
||||||
|
|
||||||
/* The goal is to migrate all live objects in source page */
|
if (!zs_can_compact(class))
|
||||||
nr_to_migrate = src_page->inuse;
|
break;
|
||||||
|
|
||||||
cc.index = 0;
|
cc.index = 0;
|
||||||
cc.s_page = src_page;
|
cc.s_page = src_page;
|
||||||
|
|
||||||
while ((dst_page = alloc_target_page(class))) {
|
while ((dst_page = isolate_target_page(class))) {
|
||||||
cc.d_page = dst_page;
|
cc.d_page = dst_page;
|
||||||
/*
|
/*
|
||||||
* If there is no more space in dst_page, try to
|
* If there is no more space in dst_page, resched
|
||||||
* allocate another zspage.
|
* and see if anyone had allocated another zspage.
|
||||||
*/
|
*/
|
||||||
if (!migrate_zspage(pool, class, &cc))
|
if (!migrate_zspage(pool, class, &cc))
|
||||||
break;
|
break;
|
||||||
|
|
||||||
putback_zspage(pool, class, dst_page);
|
putback_zspage(pool, class, dst_page);
|
||||||
nr_total_migrated += cc.nr_migrated;
|
|
||||||
nr_to_migrate -= cc.nr_migrated;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Stop if we couldn't find slot */
|
/* Stop if we couldn't find slot */
|
||||||
|
@ -1738,9 +1758,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
|
||||||
break;
|
break;
|
||||||
|
|
||||||
putback_zspage(pool, class, dst_page);
|
putback_zspage(pool, class, dst_page);
|
||||||
putback_zspage(pool, class, src_page);
|
if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
|
||||||
|
pool->stats.pages_compacted += class->pages_per_zspage;
|
||||||
spin_unlock(&class->lock);
|
spin_unlock(&class->lock);
|
||||||
nr_total_migrated += cc.nr_migrated;
|
|
||||||
cond_resched();
|
cond_resched();
|
||||||
spin_lock(&class->lock);
|
spin_lock(&class->lock);
|
||||||
}
|
}
|
||||||
|
@ -1749,14 +1769,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
|
||||||
putback_zspage(pool, class, src_page);
|
putback_zspage(pool, class, src_page);
|
||||||
|
|
||||||
spin_unlock(&class->lock);
|
spin_unlock(&class->lock);
|
||||||
|
|
||||||
return nr_total_migrated;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned long zs_compact(struct zs_pool *pool)
|
unsigned long zs_compact(struct zs_pool *pool)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
unsigned long nr_migrated = 0;
|
|
||||||
struct size_class *class;
|
struct size_class *class;
|
||||||
|
|
||||||
for (i = zs_size_classes - 1; i >= 0; i--) {
|
for (i = zs_size_classes - 1; i >= 0; i--) {
|
||||||
|
@ -1765,13 +1782,80 @@ unsigned long zs_compact(struct zs_pool *pool)
|
||||||
continue;
|
continue;
|
||||||
if (class->index != i)
|
if (class->index != i)
|
||||||
continue;
|
continue;
|
||||||
nr_migrated += __zs_compact(pool, class);
|
__zs_compact(pool, class);
|
||||||
}
|
}
|
||||||
|
|
||||||
return nr_migrated;
|
return pool->stats.pages_compacted;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(zs_compact);
|
EXPORT_SYMBOL_GPL(zs_compact);
|
||||||
|
|
||||||
|
void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
|
||||||
|
{
|
||||||
|
memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(zs_pool_stats);
|
||||||
|
|
||||||
|
static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
|
||||||
|
struct shrink_control *sc)
|
||||||
|
{
|
||||||
|
unsigned long pages_freed;
|
||||||
|
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
|
||||||
|
shrinker);
|
||||||
|
|
||||||
|
pages_freed = pool->stats.pages_compacted;
|
||||||
|
/*
|
||||||
|
* Compact classes and calculate compaction delta.
|
||||||
|
* Can run concurrently with a manually triggered
|
||||||
|
* (by user) compaction.
|
||||||
|
*/
|
||||||
|
pages_freed = zs_compact(pool) - pages_freed;
|
||||||
|
|
||||||
|
return pages_freed ? pages_freed : SHRINK_STOP;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned long zs_shrinker_count(struct shrinker *shrinker,
|
||||||
|
struct shrink_control *sc)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct size_class *class;
|
||||||
|
unsigned long pages_to_free = 0;
|
||||||
|
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
|
||||||
|
shrinker);
|
||||||
|
|
||||||
|
if (!pool->shrinker_enabled)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
for (i = zs_size_classes - 1; i >= 0; i--) {
|
||||||
|
class = pool->size_class[i];
|
||||||
|
if (!class)
|
||||||
|
continue;
|
||||||
|
if (class->index != i)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
pages_to_free += zs_can_compact(class);
|
||||||
|
}
|
||||||
|
|
||||||
|
return pages_to_free;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void zs_unregister_shrinker(struct zs_pool *pool)
|
||||||
|
{
|
||||||
|
if (pool->shrinker_enabled) {
|
||||||
|
unregister_shrinker(&pool->shrinker);
|
||||||
|
pool->shrinker_enabled = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int zs_register_shrinker(struct zs_pool *pool)
|
||||||
|
{
|
||||||
|
pool->shrinker.scan_objects = zs_shrinker_scan;
|
||||||
|
pool->shrinker.count_objects = zs_shrinker_count;
|
||||||
|
pool->shrinker.batch = 0;
|
||||||
|
pool->shrinker.seeks = DEFAULT_SEEKS;
|
||||||
|
|
||||||
|
return register_shrinker(&pool->shrinker);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* zs_create_pool - Creates an allocation pool to work from.
|
* zs_create_pool - Creates an allocation pool to work from.
|
||||||
* @flags: allocation flags used to allocate pool metadata
|
* @flags: allocation flags used to allocate pool metadata
|
||||||
|
@ -1857,6 +1941,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
|
||||||
if (zs_pool_stat_create(name, pool))
|
if (zs_pool_stat_create(name, pool))
|
||||||
goto err;
|
goto err;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Not critical, we still can use the pool
|
||||||
|
* and user can trigger compaction manually.
|
||||||
|
*/
|
||||||
|
if (zs_register_shrinker(pool) == 0)
|
||||||
|
pool->shrinker_enabled = true;
|
||||||
return pool;
|
return pool;
|
||||||
|
|
||||||
err:
|
err:
|
||||||
|
@ -1869,6 +1959,7 @@ void zs_destroy_pool(struct zs_pool *pool)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
zs_unregister_shrinker(pool);
|
||||||
zs_pool_stat_destroy(pool);
|
zs_pool_stat_destroy(pool);
|
||||||
|
|
||||||
for (i = 0; i < zs_size_classes; i++) {
|
for (i = 0; i < zs_size_classes; i++) {
|
||||||
|
|
75
mm/zswap.c
75
mm/zswap.c
|
@ -446,75 +446,14 @@ enum zswap_get_swap_ret {
|
||||||
static int zswap_get_swap_cache_page(swp_entry_t entry,
|
static int zswap_get_swap_cache_page(swp_entry_t entry,
|
||||||
struct page **retpage)
|
struct page **retpage)
|
||||||
{
|
{
|
||||||
struct page *found_page, *new_page = NULL;
|
bool page_was_allocated;
|
||||||
struct address_space *swapper_space = swap_address_space(entry);
|
|
||||||
int err;
|
|
||||||
|
|
||||||
*retpage = NULL;
|
*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
|
||||||
do {
|
NULL, 0, &page_was_allocated);
|
||||||
/*
|
if (page_was_allocated)
|
||||||
* First check the swap cache. Since this is normally
|
return ZSWAP_SWAPCACHE_NEW;
|
||||||
* called after lookup_swap_cache() failed, re-calling
|
if (!*retpage)
|
||||||
* that would confuse statistics.
|
|
||||||
*/
|
|
||||||
found_page = find_get_page(swapper_space, entry.val);
|
|
||||||
if (found_page)
|
|
||||||
break;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Get a new page to read into from swap.
|
|
||||||
*/
|
|
||||||
if (!new_page) {
|
|
||||||
new_page = alloc_page(GFP_KERNEL);
|
|
||||||
if (!new_page)
|
|
||||||
break; /* Out of memory */
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* call radix_tree_preload() while we can wait.
|
|
||||||
*/
|
|
||||||
err = radix_tree_preload(GFP_KERNEL);
|
|
||||||
if (err)
|
|
||||||
break;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Swap entry may have been freed since our caller observed it.
|
|
||||||
*/
|
|
||||||
err = swapcache_prepare(entry);
|
|
||||||
if (err == -EEXIST) { /* seems racy */
|
|
||||||
radix_tree_preload_end();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (err) { /* swp entry is obsolete ? */
|
|
||||||
radix_tree_preload_end();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
|
|
||||||
__set_page_locked(new_page);
|
|
||||||
SetPageSwapBacked(new_page);
|
|
||||||
err = __add_to_swap_cache(new_page, entry);
|
|
||||||
if (likely(!err)) {
|
|
||||||
radix_tree_preload_end();
|
|
||||||
lru_cache_add_anon(new_page);
|
|
||||||
*retpage = new_page;
|
|
||||||
return ZSWAP_SWAPCACHE_NEW;
|
|
||||||
}
|
|
||||||
radix_tree_preload_end();
|
|
||||||
ClearPageSwapBacked(new_page);
|
|
||||||
__clear_page_locked(new_page);
|
|
||||||
/*
|
|
||||||
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
|
|
||||||
* clear SWAP_HAS_CACHE flag.
|
|
||||||
*/
|
|
||||||
swapcache_free(entry);
|
|
||||||
} while (err != -ENOMEM);
|
|
||||||
|
|
||||||
if (new_page)
|
|
||||||
page_cache_release(new_page);
|
|
||||||
if (!found_page)
|
|
||||||
return ZSWAP_SWAPCACHE_FAIL;
|
return ZSWAP_SWAPCACHE_FAIL;
|
||||||
*retpage = found_page;
|
|
||||||
return ZSWAP_SWAPCACHE_EXIST;
|
return ZSWAP_SWAPCACHE_EXIST;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -816,7 +755,7 @@ static void zswap_frontswap_invalidate_area(unsigned type)
|
||||||
zswap_trees[type] = NULL;
|
zswap_trees[type] = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct zpool_ops zswap_zpool_ops = {
|
static const struct zpool_ops zswap_zpool_ops = {
|
||||||
.evict = zswap_writeback_entry
|
.evict = zswap_writeback_entry
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,84 @@
|
||||||
|
///
|
||||||
|
/// Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0
|
||||||
|
///
|
||||||
|
// Copyright: (C) 2015 Intel Corp. GPLv2.
|
||||||
|
// Options: --no-includes --include-headers
|
||||||
|
//
|
||||||
|
// Keywords: dma_pool_zalloc, pci_pool_zalloc
|
||||||
|
//
|
||||||
|
|
||||||
|
virtual context
|
||||||
|
virtual patch
|
||||||
|
virtual org
|
||||||
|
virtual report
|
||||||
|
|
||||||
|
//----------------------------------------------------------
|
||||||
|
// For context mode
|
||||||
|
//----------------------------------------------------------
|
||||||
|
|
||||||
|
@depends on context@
|
||||||
|
expression x;
|
||||||
|
statement S;
|
||||||
|
@@
|
||||||
|
|
||||||
|
* x = \(dma_pool_alloc\|pci_pool_alloc\)(...);
|
||||||
|
if ((x==NULL) || ...) S
|
||||||
|
* memset(x,0, ...);
|
||||||
|
|
||||||
|
//----------------------------------------------------------
|
||||||
|
// For patch mode
|
||||||
|
//----------------------------------------------------------
|
||||||
|
|
||||||
|
@depends on patch@
|
||||||
|
expression x;
|
||||||
|
expression a,b,c;
|
||||||
|
statement S;
|
||||||
|
@@
|
||||||
|
|
||||||
|
- x = dma_pool_alloc(a,b,c);
|
||||||
|
+ x = dma_pool_zalloc(a,b,c);
|
||||||
|
if ((x==NULL) || ...) S
|
||||||
|
- memset(x,0,...);
|
||||||
|
|
||||||
|
@depends on patch@
|
||||||
|
expression x;
|
||||||
|
expression a,b,c;
|
||||||
|
statement S;
|
||||||
|
@@
|
||||||
|
|
||||||
|
- x = pci_pool_alloc(a,b,c);
|
||||||
|
+ x = pci_pool_zalloc(a,b,c);
|
||||||
|
if ((x==NULL) || ...) S
|
||||||
|
- memset(x,0,...);
|
||||||
|
|
||||||
|
//----------------------------------------------------------
|
||||||
|
// For org and report mode
|
||||||
|
//----------------------------------------------------------
|
||||||
|
|
||||||
|
@r depends on org || report@
|
||||||
|
expression x;
|
||||||
|
expression a,b,c;
|
||||||
|
statement S;
|
||||||
|
position p;
|
||||||
|
@@
|
||||||
|
|
||||||
|
x = @p\(dma_pool_alloc\|pci_pool_alloc\)(a,b,c);
|
||||||
|
if ((x==NULL) || ...) S
|
||||||
|
memset(x,0, ...);
|
||||||
|
|
||||||
|
@script:python depends on org@
|
||||||
|
p << r.p;
|
||||||
|
x << r.x;
|
||||||
|
@@
|
||||||
|
|
||||||
|
msg="%s" % (x)
|
||||||
|
msg_safe=msg.replace("[","@(").replace("]",")")
|
||||||
|
coccilib.org.print_todo(p[0], msg_safe)
|
||||||
|
|
||||||
|
@script:python depends on report@
|
||||||
|
p << r.p;
|
||||||
|
x << r.x;
|
||||||
|
@@
|
||||||
|
|
||||||
|
msg="WARNING: *_pool_zalloc should be used for %s, instead of *_pool_alloc/memset" % (x)
|
||||||
|
coccilib.report.print_report(p[0], msg)
|
|
@ -4,7 +4,6 @@ CFLAGS = -Wall
|
||||||
BINARIES = compaction_test
|
BINARIES = compaction_test
|
||||||
BINARIES += hugepage-mmap
|
BINARIES += hugepage-mmap
|
||||||
BINARIES += hugepage-shm
|
BINARIES += hugepage-shm
|
||||||
BINARIES += hugetlbfstest
|
|
||||||
BINARIES += map_hugetlb
|
BINARIES += map_hugetlb
|
||||||
BINARIES += thuge-gen
|
BINARIES += thuge-gen
|
||||||
BINARIES += transhuge-stress
|
BINARIES += transhuge-stress
|
||||||
|
|
|
@ -1,86 +0,0 @@
|
||||||
#define _GNU_SOURCE
|
|
||||||
#include <assert.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sys/mman.h>
|
|
||||||
#include <sys/stat.h>
|
|
||||||
#include <sys/types.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
typedef unsigned long long u64;
|
|
||||||
|
|
||||||
static size_t length = 1 << 24;
|
|
||||||
|
|
||||||
static u64 read_rss(void)
|
|
||||||
{
|
|
||||||
char buf[4096], *s = buf;
|
|
||||||
int i, fd;
|
|
||||||
u64 rss;
|
|
||||||
|
|
||||||
fd = open("/proc/self/statm", O_RDONLY);
|
|
||||||
assert(fd > 2);
|
|
||||||
memset(buf, 0, sizeof(buf));
|
|
||||||
read(fd, buf, sizeof(buf) - 1);
|
|
||||||
for (i = 0; i < 1; i++)
|
|
||||||
s = strchr(s, ' ') + 1;
|
|
||||||
rss = strtoull(s, NULL, 10);
|
|
||||||
return rss << 12; /* assumes 4k pagesize */
|
|
||||||
}
|
|
||||||
|
|
||||||
static void do_mmap(int fd, int extra_flags, int unmap)
|
|
||||||
{
|
|
||||||
int *p;
|
|
||||||
int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags;
|
|
||||||
u64 before, after;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
before = read_rss();
|
|
||||||
p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0);
|
|
||||||
assert(p != MAP_FAILED ||
|
|
||||||
!"mmap returned an unexpected error");
|
|
||||||
after = read_rss();
|
|
||||||
assert(llabs(after - before - length) < 0x40000 ||
|
|
||||||
!"rss didn't grow as expected");
|
|
||||||
if (!unmap)
|
|
||||||
return;
|
|
||||||
ret = munmap(p, length);
|
|
||||||
assert(!ret || !"munmap returned an unexpected error");
|
|
||||||
after = read_rss();
|
|
||||||
assert(llabs(after - before) < 0x40000 ||
|
|
||||||
!"rss didn't shrink as expected");
|
|
||||||
}
|
|
||||||
|
|
||||||
static int open_file(const char *path)
|
|
||||||
{
|
|
||||||
int fd, err;
|
|
||||||
|
|
||||||
unlink(path);
|
|
||||||
fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL
|
|
||||||
| O_LARGEFILE | O_CLOEXEC, 0600);
|
|
||||||
assert(fd > 2);
|
|
||||||
unlink(path);
|
|
||||||
err = ftruncate(fd, length);
|
|
||||||
assert(!err);
|
|
||||||
return fd;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(void)
|
|
||||||
{
|
|
||||||
int hugefd, fd;
|
|
||||||
|
|
||||||
fd = open_file("/dev/shm/hugetlbhog");
|
|
||||||
hugefd = open_file("/hugepages/hugetlbhog");
|
|
||||||
|
|
||||||
system("echo 100 > /proc/sys/vm/nr_hugepages");
|
|
||||||
do_mmap(-1, MAP_ANONYMOUS, 1);
|
|
||||||
do_mmap(fd, 0, 1);
|
|
||||||
do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1);
|
|
||||||
do_mmap(hugefd, 0, 1);
|
|
||||||
do_mmap(hugefd, MAP_HUGETLB, 1);
|
|
||||||
/* Leak the last one to test do_exit() */
|
|
||||||
do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0);
|
|
||||||
printf("oll korrekt.\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -75,16 +75,9 @@ else
|
||||||
echo "[PASS]"
|
echo "[PASS]"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "--------------------"
|
echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
|
||||||
echo "running hugetlbfstest"
|
echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
|
||||||
echo "--------------------"
|
echo " hugetlb regression testing."
|
||||||
./hugetlbfstest
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
echo "[FAIL]"
|
|
||||||
exitcode=1
|
|
||||||
else
|
|
||||||
echo "[PASS]"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "--------------------"
|
echo "--------------------"
|
||||||
echo "running userfaultfd"
|
echo "running userfaultfd"
|
||||||
|
|
|
@ -147,7 +147,8 @@ static void *locking_thread(void *arg)
|
||||||
if (sizeof(page_nr) > sizeof(rand_nr)) {
|
if (sizeof(page_nr) > sizeof(rand_nr)) {
|
||||||
if (random_r(&rand, &rand_nr))
|
if (random_r(&rand, &rand_nr))
|
||||||
fprintf(stderr, "random_r 2 error\n"), exit(1);
|
fprintf(stderr, "random_r 2 error\n"), exit(1);
|
||||||
page_nr |= ((unsigned long) rand_nr) << 32;
|
page_nr |= (((unsigned long) rand_nr) << 16) <<
|
||||||
|
16;
|
||||||
}
|
}
|
||||||
} else
|
} else
|
||||||
page_nr += 1;
|
page_nr += 1;
|
||||||
|
@ -290,7 +291,8 @@ static void *uffd_poll_thread(void *arg)
|
||||||
msg.event), exit(1);
|
msg.event), exit(1);
|
||||||
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
|
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
|
||||||
fprintf(stderr, "unexpected write fault\n"), exit(1);
|
fprintf(stderr, "unexpected write fault\n"), exit(1);
|
||||||
offset = (char *)msg.arg.pagefault.address - area_dst;
|
offset = (char *)(unsigned long)msg.arg.pagefault.address -
|
||||||
|
area_dst;
|
||||||
offset &= ~(page_size-1);
|
offset &= ~(page_size-1);
|
||||||
if (copy_page(offset))
|
if (copy_page(offset))
|
||||||
userfaults++;
|
userfaults++;
|
||||||
|
@ -327,7 +329,8 @@ static void *uffd_read_thread(void *arg)
|
||||||
if (bounces & BOUNCE_VERIFY &&
|
if (bounces & BOUNCE_VERIFY &&
|
||||||
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
|
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
|
||||||
fprintf(stderr, "unexpected write fault\n"), exit(1);
|
fprintf(stderr, "unexpected write fault\n"), exit(1);
|
||||||
offset = (char *)msg.arg.pagefault.address - area_dst;
|
offset = (char *)(unsigned long)msg.arg.pagefault.address -
|
||||||
|
area_dst;
|
||||||
offset &= ~(page_size-1);
|
offset &= ~(page_size-1);
|
||||||
if (copy_page(offset))
|
if (copy_page(offset))
|
||||||
(*this_cpu_userfaults)++;
|
(*this_cpu_userfaults)++;
|
||||||
|
|
|
@ -57,23 +57,15 @@
|
||||||
* pagemap kernel ABI bits
|
* pagemap kernel ABI bits
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define PM_ENTRY_BYTES sizeof(uint64_t)
|
#define PM_ENTRY_BYTES 8
|
||||||
#define PM_STATUS_BITS 3
|
#define PM_PFRAME_BITS 55
|
||||||
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
|
#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
|
||||||
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
|
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
|
||||||
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
|
#define PM_SOFT_DIRTY (1ULL << 55)
|
||||||
#define PM_PSHIFT_BITS 6
|
#define PM_MMAP_EXCLUSIVE (1ULL << 56)
|
||||||
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
|
#define PM_FILE (1ULL << 61)
|
||||||
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
|
#define PM_SWAP (1ULL << 62)
|
||||||
#define __PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
|
#define PM_PRESENT (1ULL << 63)
|
||||||
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
|
|
||||||
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
|
|
||||||
|
|
||||||
#define __PM_SOFT_DIRTY (1LL)
|
|
||||||
#define PM_PRESENT PM_STATUS(4LL)
|
|
||||||
#define PM_SWAP PM_STATUS(2LL)
|
|
||||||
#define PM_SOFT_DIRTY __PM_PSHIFT(__PM_SOFT_DIRTY)
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* kernel page flags
|
* kernel page flags
|
||||||
|
@ -100,6 +92,8 @@
|
||||||
#define KPF_SLOB_FREE 49
|
#define KPF_SLOB_FREE 49
|
||||||
#define KPF_SLUB_FROZEN 50
|
#define KPF_SLUB_FROZEN 50
|
||||||
#define KPF_SLUB_DEBUG 51
|
#define KPF_SLUB_DEBUG 51
|
||||||
|
#define KPF_FILE 62
|
||||||
|
#define KPF_MMAP_EXCLUSIVE 63
|
||||||
|
|
||||||
#define KPF_ALL_BITS ((uint64_t)~0ULL)
|
#define KPF_ALL_BITS ((uint64_t)~0ULL)
|
||||||
#define KPF_HACKERS_BITS (0xffffULL << 32)
|
#define KPF_HACKERS_BITS (0xffffULL << 32)
|
||||||
|
@ -149,6 +143,9 @@ static const char * const page_flag_names[] = {
|
||||||
[KPF_SLOB_FREE] = "P:slob_free",
|
[KPF_SLOB_FREE] = "P:slob_free",
|
||||||
[KPF_SLUB_FROZEN] = "A:slub_frozen",
|
[KPF_SLUB_FROZEN] = "A:slub_frozen",
|
||||||
[KPF_SLUB_DEBUG] = "E:slub_debug",
|
[KPF_SLUB_DEBUG] = "E:slub_debug",
|
||||||
|
|
||||||
|
[KPF_FILE] = "F:file",
|
||||||
|
[KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -452,6 +449,10 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
|
||||||
|
|
||||||
if (pme & PM_SOFT_DIRTY)
|
if (pme & PM_SOFT_DIRTY)
|
||||||
flags |= BIT(SOFTDIRTY);
|
flags |= BIT(SOFTDIRTY);
|
||||||
|
if (pme & PM_FILE)
|
||||||
|
flags |= BIT(FILE);
|
||||||
|
if (pme & PM_MMAP_EXCLUSIVE)
|
||||||
|
flags |= BIT(MMAP_EXCLUSIVE);
|
||||||
|
|
||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue