alistair23-linux/include/linux/mempolicy.h
Linus Torvalds 3d59eebc5e Automatic NUMA Balancing V11
-----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2.0.18 (GNU/Linux)
 
 iQIcBAABAgAGBQJQx0kQAAoJEHzG/DNEskfi4fQP/R5PRovayroZALBMLnVJDaLD
 Ttr9p40VNXbiJ+MfRgatJjSSJZ4Jl+fC3NEqBhcwVZhckZZb9R2s0WtrSQo5+ZbB
 vdRfiuKoCaKM4cSZ08C12uTvsF6xjhjd27CTUlMkyOcDoKxMEFKelv0hocSxe4Wo
 xqlv3eF+VsY7kE1BNbgBP06SX4tDpIHRxXfqJPMHaSKQmre+cU0xG2GcEu3QGbHT
 DEDTI788YSaWLmBfMC+kWoaQl1+bV/FYvavIAS8/o4K9IKvgR42VzrXmaFaqrbgb
 72ksa6xfAi57yTmZHqyGmts06qYeBbPpKI+yIhCMInxA9CY3lPbvHppRf0RQOyzj
 YOi4hovGEMJKE+BCILukhJcZ9jCTtS3zut6v1rdvR88f4y7uhR9RfmRfsxuW7PNj
 3Rmh191+n0lVWDmhOs2psXuCLJr3LEiA0dFffN1z8REUTtTAZMsj8Rz+SvBNAZDR
 hsJhERVeXB6X5uQ5rkLDzbn1Zic60LjVw7LIp6SF2OYf/YKaF8vhyWOA8dyCEu8W
 CGo7AoG0BO8tIIr8+LvFe8CweypysZImx4AjCfIs4u9pu/v11zmBvO9NO5yfuObF
 BreEERYgTes/UITxn1qdIW4/q+Nr0iKO3CTqsmu6L1GfCz3/XzPGs3U26fUhllqi
 Ka0JKgnWvsa6ez6FSzKI
 =ivQa
 -----END PGP SIGNATURE-----

Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma

Pull Automatic NUMA Balancing bare-bones from Mel Gorman:
 "There are three implementations for NUMA balancing, this tree
  (balancenuma), numacore which has been developed in tip/master and
  autonuma which is in aa.git.

  In almost all respects balancenuma is the dumbest of the three because
  its main impact is on the VM side with no attempt to be smart about
  scheduling.  In the interest of getting the ball rolling, it would be
  desirable to see this much merged for 3.8 with the view to building
  scheduler smarts on top and adapting the VM where required for 3.9.

  The most recent set of comparisons available from different people are

    mel:    https://lkml.org/lkml/2012/12/9/108
    mingo:  https://lkml.org/lkml/2012/12/7/331
    tglx:   https://lkml.org/lkml/2012/12/10/437
    srikar: https://lkml.org/lkml/2012/12/10/397

  The results are a mixed bag.  In my own tests, balancenuma does
  reasonably well.  It's dumb as rocks and does not regress against
  mainline.  On the other hand, Ingo's tests shows that balancenuma is
  incapable of converging for this workloads driven by perf which is bad
  but is potentially explained by the lack of scheduler smarts.  Thomas'
  results show balancenuma improves on mainline but falls far short of
  numacore or autonuma.  Srikar's results indicate we all suffer on a
  large machine with imbalanced node sizes.

  My own testing showed that recent numacore results have improved
  dramatically, particularly in the last week but not universally.
  We've butted heads heavily on system CPU usage and high levels of
  migration even when it shows that overall performance is better.
  There are also cases where it regresses.  Of interest is that for
  specjbb in some configurations it will regress for lower numbers of
  warehouses and show gains for higher numbers which is not reported by
  the tool by default and sometimes missed in treports.  Recently I
  reported for numacore that the JVM was crashing with
  NullPointerExceptions but currently it's unclear what the source of
  this problem is.  Initially I thought it was in how numacore batch
  handles PTEs but I'm no longer think this is the case.  It's possible
  numacore is just able to trigger it due to higher rates of migration.

  These reports were quite late in the cycle so I/we would like to start
  with this tree as it contains much of the code we can agree on and has
  not changed significantly over the last 2-3 weeks."

* tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits)
  mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
  mm/rmap: Convert the struct anon_vma::mutex to an rwsem
  mm: migrate: Account a transhuge page properly when rate limiting
  mm: numa: Account for failed allocations and isolations as migration failures
  mm: numa: Add THP migration for the NUMA working set scanning fault case build fix
  mm: numa: Add THP migration for the NUMA working set scanning fault case.
  mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
  mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG
  mm: sched: numa: Control enabling and disabling of NUMA balancing
  mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
  mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships
  mm: numa: migrate: Set last_nid on newly allocated page
  mm: numa: split_huge_page: Transfer last_nid on tail page
  mm: numa: Introduce last_nid to the page frame
  sched: numa: Slowly increase the scanning period as NUMA faults are handled
  mm: numa: Rate limit setting of pte_numa if node is saturated
  mm: numa: Rate limit the amount of memory that is migrated between nodes
  mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting
  mm: numa: Migrate pages handled during a pmd_numa hinting fault
  mm: numa: Migrate on reference policy
  ...
2012-12-16 15:18:08 -08:00

320 lines
7.6 KiB
C

/*
* NUMA memory policies for Linux.
* Copyright 2003,2004 Andi Kleen SuSE Labs
*/
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>
struct mm_struct;
#ifdef CONFIG_NUMA
/*
* Describe a memory policy.
*
* A mempolicy can be either associated with a process or with a VMA.
* For VMA related allocations the VMA policy is preferred, otherwise
* the process policy is used. Interrupts ignore the memory policy
* of the current process.
*
* Locking policy for interlave:
* In process context there is no locking because only the process accesses
* its own state. All vma manipulation is somewhat protected by a down_read on
* mmap_sem.
*
* Freeing policy:
* Mempolicy objects are reference counted. A mempolicy will be freed when
* mpol_put() decrements the reference count to zero.
*
* Duplicating policy objects:
* mpol_dup() allocates a new mempolicy and copies the specified mempolicy
* to the new storage. The reference count of the new object is initialized
* to 1, representing the caller of mpol_dup().
*/
struct mempolicy {
atomic_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
union {
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave/bind */
/* undefined for default */
} v;
union {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
} w;
};
/*
* Support for managing mempolicy data objects (clone, copy, destroy)
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
*/
extern void __mpol_put(struct mempolicy *pol);
static inline void mpol_put(struct mempolicy *pol)
{
if (pol)
__mpol_put(pol);
}
/*
* Does mempolicy pol need explicit unref after use?
* Currently only needed for shared policies.
*/
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
return (pol && (pol->flags & MPOL_F_SHARED));
}
static inline void mpol_cond_put(struct mempolicy *pol)
{
if (mpol_needs_cond_ref(pol))
__mpol_put(pol);
}
extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
if (pol)
pol = __mpol_dup(pol);
return pol;
}
#define vma_policy(vma) ((vma)->vm_policy)
#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol))
static inline void mpol_get(struct mempolicy *pol)
{
if (pol)
atomic_inc(&pol->refcnt);
}
extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
if (a == b)
return true;
return __mpol_equal(a, b);
}
/*
* Tree of shared policies for a shared memory region.
* Maintain the policies in a pseudo mm that contains vmas. The vmas
* carry the policy. As a special twist the pseudo mm is indexed in pages, not
* bytes, so that we can work with shared memory segments bigger than
* unsigned long.
*/
struct sp_node {
struct rb_node nd;
unsigned long start, end;
struct mempolicy *policy;
};
struct shared_policy {
struct rb_root root;
struct mutex mutex;
};
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new);
void mpol_free_shared_policy(struct shared_policy *p);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
unsigned long idx);
struct mempolicy *get_vma_policy(struct task_struct *tsk,
struct vm_area_struct *vma, unsigned long addr);
extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
enum mpol_rebind_step step);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
extern void mpol_fix_fork_child_flag(struct task_struct *p);
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
const nodemask_t *mask);
extern unsigned slab_node(void);
extern enum zone_type policy_zone;
static inline void check_highest_zone(enum zone_type k)
{
if (k > policy_zone && k != ZONE_MOVABLE)
policy_zone = k;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags);
#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context);
#endif
extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
int no_context);
/* Check if a vma is migratable */
static inline int vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP))
return 0;
/*
* Migration allocates pages in the highest zone. If we cannot
* do so then migration (at least from node to node) is not
* possible.
*/
if (vma->vm_file &&
gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
< policy_zone)
return 0;
return 1;
}
extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
#else
struct mempolicy {};
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
return true;
}
static inline void mpol_put(struct mempolicy *p)
{
}
static inline void mpol_cond_put(struct mempolicy *pol)
{
}
static inline void mpol_get(struct mempolicy *pol)
{
}
static inline struct mempolicy *mpol_dup(struct mempolicy *old)
{
return NULL;
}
struct shared_policy {};
static inline int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma,
struct mempolicy *new)
{
return -EINVAL;
}
static inline void mpol_shared_policy_init(struct shared_policy *sp,
struct mempolicy *mpol)
{
}
static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
return NULL;
}
#define vma_policy(vma) NULL
#define vma_set_policy(vma, pol) do {} while(0)
static inline void numa_policy_init(void)
{
}
static inline void numa_default_policy(void)
{
}
static inline void mpol_rebind_task(struct task_struct *tsk,
const nodemask_t *new,
enum mpol_rebind_step step)
{
}
static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}
static inline void mpol_fix_fork_child_flag(struct task_struct *p)
{
}
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
*mpol = NULL;
*nodemask = NULL;
return node_zonelist(0, gfp_flags);
}
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
return false;
}
static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
const nodemask_t *mask)
{
return false;
}
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return 0;
}
static inline void check_highest_zone(int k)
{
}
#ifdef CONFIG_TMPFS
static inline int mpol_parse_str(char *str, struct mempolicy **mpol,
int no_context)
{
return 1; /* error */
}
#endif
static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
int no_context)
{
return 0;
}
static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
unsigned long address)
{
return -1; /* no node preference */
}
#endif /* CONFIG_NUMA */
#endif