|
|
|
#ifndef _LINUX_MEMPOLICY_H
|
|
|
|
#define _LINUX_MEMPOLICY_H 1
|
|
|
|
|
|
|
|
#include <linux/errno.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NUMA memory policies for Linux.
|
|
|
|
* Copyright 2003,2004 Andi Kleen SuSE Labs
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Policies */
#define MPOL_DEFAULT	0
#define MPOL_PREFERRED	1
#define MPOL_BIND	2
#define MPOL_INTERLEAVE	3

/* Largest policy value defined above. */
#define MPOL_MAX MPOL_INTERLEAVE
|
|
|
|
|
|
|
|
/* Flags for get_mem_policy */
#define MPOL_F_NODE	(1 << 0)	/* return next IL mode instead of node mask */
#define MPOL_F_ADDR	(1 << 1)	/* look up vma using address */
|
|
|
|
|
|
|
|
/* Flags for mbind */
#define MPOL_MF_STRICT	 (1 << 0)	/* Verify existing pages in the mapping */
#define MPOL_MF_MOVE	 (1 << 1)	/* Move pages owned by this process to conform to mapping */
#define MPOL_MF_MOVE_ALL (1 << 2)	/* Move every page to conform to mapping */
#define MPOL_MF_INTERNAL (1 << 3)	/* Internal flags start here */
|
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
#include <linux/config.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/nodemask.h>
|
|
|
|
|
|
|
|
struct vm_area_struct;
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Describe a memory policy.
|
|
|
|
*
|
|
|
|
* A mempolicy can be either associated with a process or with a VMA.
|
|
|
|
* For VMA related allocations the VMA policy is preferred, otherwise
|
|
|
|
* the process policy is used. Interrupts ignore the memory policy
|
|
|
|
* of the current process.
|
|
|
|
*
|
|
|
|
* Locking policy for interleave:
|
|
|
|
* In process context there is no locking because only the process accesses
|
|
|
|
* its own state. All vma manipulation is somewhat protected by a down_read on
|
|
|
|
* mmap_sem.
|
|
|
|
*
|
|
|
|
* Freeing policy:
|
|
|
|
* When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
|
|
|
|
* All other policies don't have any external state. mpol_free() handles this.
|
|
|
|
*
|
|
|
|
* Copying policy objects:
|
|
|
|
* For MPOL_BIND the zonelist must be always duplicated. mpol_copy() does this.
|
|
|
|
*/
|
|
|
|
struct mempolicy {
|
|
|
|
atomic_t refcnt;
|
|
|
|
short policy; /* See MPOL_* above */
|
|
|
|
union {
|
|
|
|
struct zonelist *zonelist; /* bind */
|
|
|
|
short preferred_node; /* preferred */
|
|
|
|
nodemask_t nodes; /* interleave */
|
|
|
|
/* undefined for default */
|
|
|
|
} v;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Support for managing mempolicy data objects (clone, copy, destroy)
|
|
|
|
* The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
|
|
|
|
*/
|
|
|
|
|
|
|
|
extern void __mpol_free(struct mempolicy *pol);

/*
 * Hand @pol to __mpol_free(). A NULL policy is filtered out inline, so
 * the external helper is only ever called with a real policy object.
 */
static inline void mpol_free(struct mempolicy *pol)
{
	if (!pol)
		return;

	__mpol_free(pol);
}
|
|
|
|
|
|
|
|
extern struct mempolicy *__mpol_copy(struct mempolicy *pol);
|
|
|
|
static inline struct mempolicy *mpol_copy(struct mempolicy *pol)
|
|
|
|
{
|
|
|
|
if (pol)
|
|
|
|
pol = __mpol_copy(pol);
|
|
|
|
return pol;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Read / write the policy pointer kept in a vma's vm_policy field. */
#define vma_policy(vma)			((vma)->vm_policy)
#define vma_set_policy(vma, pol)	((vma)->vm_policy = (pol))
|
|
|
|
|
|
|
|
static inline void mpol_get(struct mempolicy *pol)
|
|
|
|
{
|
|
|
|
if (pol)
|
|
|
|
atomic_inc(&pol->refcnt);
|
|
|
|
}
|
|
|
|
|
|
|
|
extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b);

/*
 * Compare two policies. Identical pointers (including two NULLs) are
 * reported equal inline; everything else is decided by __mpol_equal(),
 * whose return value is passed through unchanged.
 */
static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	return (a == b) ? 1 : __mpol_equal(a, b);
}

/* Compare the policies attached to two vmas. */
#define vma_mpol_equal(a, b) mpol_equal(vma_policy(a), vma_policy(b))
|
|
|
|
|
|
|
|
/*
 * Reset a vma to the default policy by clearing its vm_policy pointer.
 * Could later add inheritance of the process policy here.
 */
#define mpol_set_vma_default(vma)	((vma)->vm_policy = NULL)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Tree of shared policies for a shared memory region.
|
|
|
|
* Maintain the policies in a pseudo mm that contains vmas. The vmas
|
|
|
|
* carry the policy. As a special twist the pseudo mm is indexed in pages, not
|
|
|
|
* bytes, so that we can work with shared memory segments bigger than
|
|
|
|
* unsigned long.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct sp_node {
|
|
|
|
struct rb_node nd;
|
|
|
|
unsigned long start, end;
|
|
|
|
struct mempolicy *policy;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct shared_policy {
|
|
|
|
struct rb_root root;
|
|
|
|
spinlock_t lock;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline void mpol_shared_policy_init(struct shared_policy *info)
|
|
|
|
{
|
|
|
|
info->root = RB_ROOT;
|
|
|
|
spin_lock_init(&info->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
int mpol_set_shared_policy(struct shared_policy *info,
|
|
|
|
struct vm_area_struct *vma,
|
|
|
|
struct mempolicy *new);
|
|
|
|
void mpol_free_shared_policy(struct shared_policy *p);
|
|
|
|
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
|
|
|
|
unsigned long idx);
|
|
|
|
|
|
|
|
struct mempolicy *get_vma_policy(struct task_struct *task,
|
|
|
|
struct vm_area_struct *vma, unsigned long addr);
|
|
|
|
|
|
|
|
extern void numa_default_policy(void);
|
|
|
|
extern void numa_policy_init(void);
|
[PATCH] cpusets: automatic numa mempolicy rebinding
This patch automatically updates a tasks NUMA mempolicy when its cpuset
memory placement changes. It does so within the context of the task,
without any need to support low level external mempolicy manipulation.
If a system is not using cpusets, or if running on a system with just the
root (all-encompassing) cpuset, then this remap is a no-op. Only when a
task is moved between cpusets, or a cpusets memory placement is changed
does the following apply. Otherwise, the main routine below,
rebind_policy() is not even called.
When mixing cpusets, scheduler affinity, and NUMA mempolicies, the
essential role of cpusets is to place jobs (several related tasks) on a set
of CPUs and Memory Nodes, the essential role of sched_setaffinity is to
manage a jobs processor placement within its allowed cpuset, and the
essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs
memory placement within its allowed cpuset.
However, CPU affinity and NUMA memory placement are managed within the
kernel using absolute system wide numbering, not cpuset relative numbering.
This is ok until a job is migrated to a different cpuset, or what's the
same, a jobs cpuset is moved to different CPUs and Memory Nodes.
Then the CPU affinity and NUMA memory placement of the tasks in the job
need to be updated, to preserve their cpuset-relative position. This can
be done for CPU affinity using sched_setaffinity() from user code, as one
task can modify anothers CPU affinity. This cannot be done from an
external task for NUMA memory placement, as that can only be modified in
the context of the task using it.
However, it is easy enough to remap a task's NUMA mempolicy automatically when
a task is migrated, using the existing cpuset mechanism to trigger a
refresh of a tasks memory placement after its cpuset has changed. All that
is needed is the old and new nodemask, and notice to the task that it needs
to rebind its mempolicy. The tasks mems_allowed has the old mask, the
tasks cpuset has the new mask, and the existing
cpuset_update_current_mems_allowed() mechanism provides the notice. The
bitmap/cpumask/nodemask remap operators provide the cpuset relative
calculations.
This patch leaves open a couple of issues:
1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies:
These mempolicies may reference nodes outside of those allowed to
the current task by its cpuset. Tasks are migrated as part of jobs,
which reside on what might be several cpusets in a subtree. When such
a job is migrated, all NUMA memory policy references to nodes within
that cpuset subtree should be translated, and references to any nodes
outside that subtree should be left untouched. A future patch will
provide the cpuset mechanism needed to mark such subtrees. With that
patch, we will be able to correctly migrate these other memory policies
across a job migration.
2) Updating cpuset, affinity and memory policies in user space:
This is harder. Any placement state stored in user space using
system-wide numbering will be invalidated across a migration. More
work will be required to provide user code with a migration-safe means
to manage its cpuset relative placement, while preserving the current
API's that pass system wide numbers, not cpuset relative numbers across
the kernel-user boundary.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
19 years ago
|
|
|
extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
|
|
|
|
extern struct mempolicy default_policy;
|
|
|
|
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr);
|
|
|
|
|
|
|
|
extern int policy_zone;

/* Raise policy_zone to @k when @k is larger than the current value. */
static inline void check_highest_zone(int k)
{
	if (policy_zone < k)
		policy_zone = k;
}
|
|
|
|
|
|
|
|
int do_migrate_pages(struct mm_struct *mm,
|
|
|
|
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA: the policy type is an empty placeholder. */
struct mempolicy {};

/* !CONFIG_NUMA stub: every pair of policies is reported as equal. */
static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	return 1;
}

/* !CONFIG_NUMA stub: any two vmas are treated as having equal policies. */
#define vma_mpol_equal(a, b) 1
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: expands to an empty statement. */
#define mpol_set_vma_default(vma)	do { } while (0)
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: nothing to release. */
static inline void mpol_free(struct mempolicy *p)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: there is no reference count to take. */
static inline void mpol_get(struct mempolicy *pol)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: ignores @old and always returns NULL. */
static inline struct mempolicy *mpol_copy(struct mempolicy *old)
{
	return NULL;
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA: the shared policy container is an empty placeholder. */
struct shared_policy {};

/* !CONFIG_NUMA stub: ignores its arguments and always returns -EINVAL. */
static inline int mpol_set_shared_policy(struct shared_policy *info,
					 struct vm_area_struct *vma,
					 struct mempolicy *new)
{
	return -EINVAL;
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: nothing to initialise. */
static inline void mpol_shared_policy_init(struct shared_policy *info)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: nothing to free. */
static inline void mpol_free_shared_policy(struct shared_policy *p)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: the lookup always returns NULL. */
static inline struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
							  unsigned long idx)
{
	return NULL;
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stubs: reading the policy yields NULL, setting it does nothing. */
#define vma_policy(vma)			NULL
#define vma_set_policy(vma, pol)	do { } while (0)
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: no policy state to set up. */
static inline void numa_policy_init(void)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: does nothing. */
static inline void numa_default_policy(void)
{
}
|
|
|
|
|
[PATCH] cpusets: automatic numa mempolicy rebinding
This patch automatically updates a tasks NUMA mempolicy when its cpuset
memory placement changes. It does so within the context of the task,
without any need to support low level external mempolicy manipulation.
If a system is not using cpusets, or if running on a system with just the
root (all-encompassing) cpuset, then this remap is a no-op. Only when a
task is moved between cpusets, or a cpusets memory placement is changed
does the following apply. Otherwise, the main routine below,
rebind_policy() is not even called.
When mixing cpusets, scheduler affinity, and NUMA mempolicies, the
essential role of cpusets is to place jobs (several related tasks) on a set
of CPUs and Memory Nodes, the essential role of sched_setaffinity is to
manage a jobs processor placement within its allowed cpuset, and the
essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs
memory placement within its allowed cpuset.
However, CPU affinity and NUMA memory placement are managed within the
kernel using absolute system wide numbering, not cpuset relative numbering.
This is ok until a job is migrated to a different cpuset, or what's the
same, a jobs cpuset is moved to different CPUs and Memory Nodes.
Then the CPU affinity and NUMA memory placement of the tasks in the job
need to be updated, to preserve their cpuset-relative position. This can
be done for CPU affinity using sched_setaffinity() from user code, as one
task can modify anothers CPU affinity. This cannot be done from an
external task for NUMA memory placement, as that can only be modified in
the context of the task using it.
However, it is easy enough to remap a task's NUMA mempolicy automatically when
a task is migrated, using the existing cpuset mechanism to trigger a
refresh of a tasks memory placement after its cpuset has changed. All that
is needed is the old and new nodemask, and notice to the task that it needs
to rebind its mempolicy. The tasks mems_allowed has the old mask, the
tasks cpuset has the new mask, and the existing
cpuset_update_current_mems_allowed() mechanism provides the notice. The
bitmap/cpumask/nodemask remap operators provide the cpuset relative
calculations.
This patch leaves open a couple of issues:
1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies:
These mempolicies may reference nodes outside of those allowed to
the current task by its cpuset. Tasks are migrated as part of jobs,
which reside on what might be several cpusets in a subtree. When such
a job is migrated, all NUMA memory policy references to nodes within
that cpuset subtree should be translated, and references to any nodes
outside that subtree should be left untouched. A future patch will
provide the cpuset mechanism needed to mark such subtrees. With that
patch, we will be able to correctly migrate these other memory policies
across a job migration.
2) Updating cpuset, affinity and memory policies in user space:
This is harder. Any placement state stored in user space using
system-wide numbering will be invalidated across a migration. More
work will be required to provide user code with a migration-safe means
to manage its cpuset relative placement, while preserving the current
API's that pass system wide numbers, not cpuset relative numbers across
the kernel-user boundary.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
19 years ago
|
|
|
/* !CONFIG_NUMA stub: both node masks are ignored. */
static inline void numa_policy_rebind(const nodemask_t *old,
				      const nodemask_t *new)
{
}
|
|
|
|
|
|
|
|
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
|
|
|
|
unsigned long addr)
|
|
|
|
{
|
|
|
|
return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: @k is ignored. */
static inline void check_highest_zone(int k)
{
}
|
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
|
|
|
|
#endif
|