mm/demotion: build demotion targets based on explicit memory tiers
This patch switches the demotion target building logic to use memory tiers instead of NUMA distance. All N_MEMORY NUMA nodes will be placed in the default memory tier and additional memory tiers will be added by drivers like dax kmem. This patch builds the demotion target for a NUMA node by looking at all memory tiers below the tier to which the NUMA node belongs. The closest node in the immediately following memory tier is used as a demotion target. Since we are now only building demotion targets for N_MEMORY NUMA nodes, the CPU hotplug calls are removed in this patch. Link: https://lkml.kernel.org/r/20220818131042.113280-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Reviewed-by: "Huang, Ying" <ying.huang@intel.com> Acked-by: Wei Xu <weixugc@google.com> Cc: Alistair Popple <apopple@nvidia.com> Cc: Bharata B Rao <bharata@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hesham Almatary <hesham.almatary@huawei.com> Cc: Jagdish Gediya <jvgediya.oss@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Tim Chen <tim.c.chen@intel.com> Cc: Yang Shi <shy828301@gmail.com> Cc: SeongJae Park <sj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
7b88bda376
commit
6c542ab757
5 changed files with 239 additions and 423 deletions
|
|
@ -6,6 +6,8 @@
|
|||
#include <linux/memory.h>
|
||||
#include <linux/memory-tiers.h>
|
||||
|
||||
#include "internal.h"
|
||||
|
||||
struct memory_tier {
|
||||
/* hierarchy of memory tiers */
|
||||
struct list_head list;
|
||||
|
|
@ -19,6 +21,10 @@ struct memory_tier {
|
|||
int adistance_start;
|
||||
};
|
||||
|
||||
/*
 * Per-node demotion state. For each N_MEMORY node, 'preferred' holds the
 * set of equally-close candidate nodes in the next lower memory tier;
 * next_demotion_node() picks one of them at random. Rebuilt under
 * memory_tier_lock by establish_demotion_targets().
 */
struct demotion_nodes {
	nodemask_t preferred;
};
|
||||
|
||||
struct node_memory_type_map {
|
||||
struct memory_dev_type *memtype;
|
||||
int map_count;
|
||||
|
|
@ -28,6 +34,66 @@ static DEFINE_MUTEX(memory_tier_lock);
|
|||
static LIST_HEAD(memory_tiers);
|
||||
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
|
||||
static struct memory_dev_type *default_dram_type;
|
||||
#ifdef CONFIG_MIGRATION
|
||||
/*
|
||||
* node_demotion[] examples:
|
||||
*
|
||||
* Example 1:
|
||||
*
|
||||
* Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
|
||||
*
|
||||
* node distances:
|
||||
* node 0 1 2 3
|
||||
* 0 10 20 30 40
|
||||
* 1 20 10 40 30
|
||||
* 2 30 40 10 40
|
||||
* 3 40 30 40 10
|
||||
*
|
||||
* memory_tiers0 = 0-1
|
||||
* memory_tiers1 = 2-3
|
||||
*
|
||||
* node_demotion[0].preferred = 2
|
||||
* node_demotion[1].preferred = 3
|
||||
* node_demotion[2].preferred = <empty>
|
||||
* node_demotion[3].preferred = <empty>
|
||||
*
|
||||
* Example 2:
|
||||
*
|
||||
* Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
|
||||
*
|
||||
* node distances:
|
||||
* node 0 1 2
|
||||
* 0 10 20 30
|
||||
* 1 20 10 30
|
||||
* 2 30 30 10
|
||||
*
|
||||
* memory_tiers0 = 0-2
|
||||
*
|
||||
* node_demotion[0].preferred = <empty>
|
||||
* node_demotion[1].preferred = <empty>
|
||||
* node_demotion[2].preferred = <empty>
|
||||
*
|
||||
* Example 3:
|
||||
*
|
||||
* Node 0 is a CPU + DRAM node, node 1 is an HBM node, node 2 is a PMEM node.
|
||||
*
|
||||
* node distances:
|
||||
* node 0 1 2
|
||||
* 0 10 20 30
|
||||
* 1 20 10 40
|
||||
* 2 30 40 10
|
||||
*
|
||||
* memory_tiers0 = 1
|
||||
* memory_tiers1 = 0
|
||||
* memory_tiers2 = 2
|
||||
*
|
||||
* node_demotion[0].preferred = 2
|
||||
* node_demotion[1].preferred = 0
|
||||
* node_demotion[2].preferred = <empty>
|
||||
*
|
||||
*/
|
||||
static struct demotion_nodes *node_demotion __read_mostly;
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
|
||||
{
|
||||
|
|
@ -73,6 +139,154 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
|
|||
return new_memtier;
|
||||
}
|
||||
|
||||
/*
 * Look up the memory tier @node currently belongs to.
 *
 * Returns NULL when the node has no registered memory type, or when the
 * node is no longer set in that type's nodemask (e.g. after it has been
 * cleared on memory offline). Callers in this file invoke this with
 * memory_tier_lock held.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	struct memory_dev_type *memtype;

	memtype = node_memory_types[node];
	if (memtype && node_isset(node, memtype->nodes))
		return memtype->memtier;
	return NULL;
}
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
/**
 * next_demotion_node() - Get the next node in the demotion path
 * @node: The starting node to lookup the next node
 *
 * Return: node id for next memory node in the demotion path hierarchy
 * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
 * @node online or guarantee that it *continues* to be the next demotion
 * target.
 */
int next_demotion_node(int node)
{
	struct demotion_nodes *nd;
	int target;

	/*
	 * node_demotion is only allocated when CONFIG_MIGRATION is set;
	 * a NULL array means demotion is disabled entirely.
	 */
	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	/*
	 * node_demotion[] is updated without excluding this
	 * function from running.
	 *
	 * Make sure to use RCU over entire code blocks if
	 * node_demotion[] reads need to be consistent.
	 */
	rcu_read_lock();
	/*
	 * If there are multiple target nodes, just select one
	 * target node randomly.
	 *
	 * In addition, we can also use round-robin to select
	 * target node, but we should introduce another variable
	 * for node_demotion[] to record last selected target node,
	 * that may cause cache ping-pong due to the changing of
	 * last target node. Or introducing per-cpu data to avoid
	 * caching issue, which seems more complicated. So selecting
	 * target node randomly seems better until now.
	 */
	target = node_random(&nd->preferred);
	rcu_read_unlock();

	return target;
}
|
||||
|
||||
/*
 * Clear the preferred demotion mask of every N_MEMORY node, then wait
 * for all in-flight next_demotion_node() RCU readers to drain before
 * the caller rebuilds the targets. The caller has already verified
 * that node_demotion is allocated and holds memory_tier_lock.
 */
static void disable_all_demotion_targets(void)
{
	int node;

	for_each_node_state(node, N_MEMORY)
		node_demotion[node].preferred = NODE_MASK_NONE;
	/*
	 * Ensure that the "disable" is visible across the system.
	 * Readers will see either a combination of before+disable
	 * state or disable+after. They will never see before and
	 * after state together.
	 */
	synchronize_rcu();
}
|
||||
|
||||
/*
 * Collect every NUMA node belonging to @memtier by OR-ing together the
 * nodemasks of all memory_dev_types linked on the tier's memory_types
 * list. Returns the union by value.
 */
static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
	struct memory_dev_type *mdt;
	nodemask_t union_mask = NODE_MASK_NONE;

	list_for_each_entry(mdt, &memtier->memory_types, tier_sibiling)
		nodes_or(union_mask, union_mask, mdt->nodes);

	return union_mask;
}
|
||||
|
||||
/*
 * Find an automatic demotion target for all memory
 * nodes. Failing here is OK. It might just indicate
 * being at the end of a chain.
 *
 * For each N_MEMORY node: locate its tier, step to the next (lower)
 * tier in the memory_tiers list, and record every node of that tier at
 * the minimum NUMA distance in node_demotion[node].preferred.
 */
static void establish_demotion_targets(void)
{
	struct memory_tier *memtier;
	struct demotion_nodes *nd;
	int target = NUMA_NO_NODE, node;
	int distance, best_distance;
	nodemask_t tier_nodes;

	lockdep_assert_held_once(&memory_tier_lock);

	/* node_demotion may be NULL if its allocation failed at init. */
	if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
		return;

	/* Wipe old targets and drain RCU readers before rebuilding. */
	disable_all_demotion_targets();

	for_each_node_state(node, N_MEMORY) {
		best_distance = -1;
		nd = &node_demotion[node];

		memtier = __node_get_memory_tier(node);
		/* Nodes in the lowest tier have nowhere to demote to. */
		if (!memtier || list_is_last(&memtier->list, &memory_tiers))
			continue;
		/*
		 * Get the lower memtier to find the demotion node list.
		 */
		memtier = list_next_entry(memtier, list);
		tier_nodes = get_memtier_nodemask(memtier);
		/*
		 * find_next_best_node, use 'used' nodemask as a skip list.
		 * Add all memory nodes except the selected memory tier
		 * nodelist to skip list so that we find the best node from the
		 * memtier nodelist.
		 */
		nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

		/*
		 * Find all the nodes in the memory tier node list of same best distance.
		 * add them to the preferred mask. We randomly select between nodes
		 * in the preferred mask when allocating pages during demotion.
		 */
		do {
			target = find_next_best_node(node, &tier_nodes);
			if (target == NUMA_NO_NODE)
				break;

			distance = node_distance(node, target);
			/* Accept the first node, then only ties with it. */
			if (distance == best_distance || best_distance == -1) {
				best_distance = distance;
				node_set(target, nd->preferred);
			} else {
				break;
			}
		} while (1);
	}
}
|
||||
|
||||
#else
|
||||
static inline void disable_all_demotion_targets(void) {}
|
||||
static inline void establish_demotion_targets(void) {}
|
||||
#endif /* CONFIG_MIGRATION */
|
||||
|
||||
static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
|
||||
{
|
||||
if (!node_memory_types[node].memtype)
|
||||
|
|
@ -109,16 +323,6 @@ static struct memory_tier *set_node_memory_tier(int node)
|
|||
return memtier;
|
||||
}
|
||||
|
||||
/*
 * Map @node to its current memory tier via the node's registered
 * memory_dev_type. A NULL return means either no type is registered
 * for the node or the node has been removed from the type's nodemask.
 * NOTE(review): callers appear to hold memory_tier_lock — confirm.
 */
static struct memory_tier *__node_get_memory_tier(int node)
{
	struct memory_dev_type *memtype;

	memtype = node_memory_types[node];
	if (memtype && node_isset(node, memtype->nodes))
		return memtype->memtier;
	return NULL;
}
|
||||
|
||||
static void destroy_memory_tier(struct memory_tier *memtier)
|
||||
{
|
||||
list_del(&memtier->list);
|
||||
|
|
@ -207,6 +411,7 @@ EXPORT_SYMBOL_GPL(clear_node_memory_type);
|
|||
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
|
||||
unsigned long action, void *_arg)
|
||||
{
|
||||
struct memory_tier *memtier;
|
||||
struct memory_notify *arg = _arg;
|
||||
|
||||
/*
|
||||
|
|
@ -219,12 +424,15 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
|
|||
switch (action) {
|
||||
case MEM_OFFLINE:
|
||||
mutex_lock(&memory_tier_lock);
|
||||
clear_node_memory_tier(arg->status_change_nid);
|
||||
if (clear_node_memory_tier(arg->status_change_nid))
|
||||
establish_demotion_targets();
|
||||
mutex_unlock(&memory_tier_lock);
|
||||
break;
|
||||
case MEM_ONLINE:
|
||||
mutex_lock(&memory_tier_lock);
|
||||
set_node_memory_tier(arg->status_change_nid);
|
||||
memtier = set_node_memory_tier(arg->status_change_nid);
|
||||
if (!IS_ERR(memtier))
|
||||
establish_demotion_targets();
|
||||
mutex_unlock(&memory_tier_lock);
|
||||
break;
|
||||
}
|
||||
|
|
@ -237,6 +445,11 @@ static int __init memory_tier_init(void)
|
|||
int node;
|
||||
struct memory_tier *memtier;
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
|
||||
GFP_KERNEL);
|
||||
WARN_ON(!node_demotion);
|
||||
#endif
|
||||
mutex_lock(&memory_tier_lock);
|
||||
/*
|
||||
* For now we can have 4 faster memory tiers with smaller adistance
|
||||
|
|
@ -259,6 +472,7 @@ static int __init memory_tier_init(void)
|
|||
*/
|
||||
break;
|
||||
}
|
||||
establish_demotion_targets();
|
||||
mutex_unlock(&memory_tier_lock);
|
||||
|
||||
hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue