/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Adapted for Xen: Ryan Harper <ryanh@us.ibm.com>
 */
#include <xen/mm.h>
#include <xen/string.h>
#include <xen/init.h>
#include <xen/ctype.h>
#include <xen/nodemask.h>
#include <xen/numa.h>
#include <xen/keyhandler.h>
#include <xen/time.h>
#include <xen/smp.h>
#include <xen/pfn.h>
#include <asm/acpi.h>
#include <xen/sched.h>
static int numa_setup(char *s);
20
custom_param("numa", numa_setup);
/* Round x up to the next multiple of y; y must be a power of two. */
#define round_up(x,y) ((((x)+(y))-1) & (~((y)-1)))
29
struct node_data node_data[MAX_NUMNODES];
31
/* Mapping from pdx to node id */
33
static typeof(*memnodemap) _memnodemap[64];
34
unsigned long memnodemapsize;
37
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
38
[0 ... NR_CPUS-1] = NUMA_NO_NODE
41
* Keep BIOS's CPU2node information, should not be used for memory allocaion
43
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
44
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
46
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
48
nodemask_t __read_mostly node_online_map = { { [0] = 1UL } };
50
int numa_off __devinitdata = 0;
52
int acpi_numa __devinitdata;
55
* Given a shift value, try to populate memnodemap[]
58
* 0 if memnodmap[] too small (of shift too small)
59
* -1 if node overlap or lost ram (shift too big)
61
static int __init populate_memnodemap(const struct node *nodes,
62
int numnodes, int shift, int *nodeids)
64
unsigned long spdx, epdx;
67
memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
68
for (i = 0; i < numnodes; i++) {
69
spdx = paddr_to_pdx(nodes[i].start);
70
epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
73
if ((epdx >> shift) >= memnodemapsize)
76
if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
80
memnodemap[spdx >> shift] = i;
82
memnodemap[spdx >> shift] = nodeids[i];
84
spdx += (1UL << shift);
85
} while (spdx < epdx);
91
static int __init allocate_cachealigned_memnodemap(void)
94
unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
95
unsigned long mfn = alloc_boot_pages(size, 1);
99
"NUMA: Unable to allocate Memory to Node hash map\n");
104
memnodemap = mfn_to_virt(mfn);
107
printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
109
memnodemapsize = size / sizeof(*memnodemap);
114
"Memory to Node hash needs %lu entries, got only %zu\n",
115
memnodemapsize, ARRAY_SIZE(_memnodemap));
122
* The LSB of all start and end addresses in the node map is the value of the
123
* maximum possible shift.
125
static int __init extract_lsb_from_nodes(const struct node *nodes,
128
int i, nodes_used = 0;
129
unsigned long spdx, epdx;
130
unsigned long bitfield = 0, memtop = 0;
132
for (i = 0; i < numnodes; i++) {
133
spdx = paddr_to_pdx(nodes[i].start);
134
epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
143
i = BITS_PER_LONG - 1;
145
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
146
memnodemapsize = (memtop >> i) + 1;
150
int __init compute_hash_shift(struct node *nodes, int numnodes,
155
shift = extract_lsb_from_nodes(nodes, numnodes);
156
if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
157
memnodemap = _memnodemap;
158
else if (allocate_cachealigned_memnodemap())
160
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
163
if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
164
printk(KERN_INFO "Your memory is not aligned you need to "
165
"rebuild your kernel with a bigger NODEMAPSIZE "
166
"shift=%d\n", shift);
171
/* initialize NODE_DATA given nodeid and start/end */
172
void __init setup_node_bootmem(int nodeid, u64 start, u64 end)
174
unsigned long start_pfn, end_pfn;
176
start_pfn = start >> PAGE_SHIFT;
177
end_pfn = end >> PAGE_SHIFT;
179
NODE_DATA(nodeid)->node_id = nodeid;
180
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
181
NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
183
node_set_online(nodeid);
186
void __init numa_init_array(void)
189
/* There are unfortunately some poorly designed mainboards around
190
that only connect memory to a single CPU. This breaks the 1:1 cpu->node
191
mapping. To avoid this fill in the mapping for all possible
192
CPUs, as the number of CPUs is not known yet.
193
We round robin the existing nodes. */
194
rr = first_node(node_online_map);
195
for (i = 0; i < NR_CPUS; i++) {
196
if (cpu_to_node[i] != NUMA_NO_NODE)
198
numa_set_node(i, rr);
199
rr = next_node(rr, node_online_map);
200
if (rr == MAX_NUMNODES)
201
rr = first_node(node_online_map);
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via the "numa=fake=" boot option. */
static int numa_fake __initdata = 0;

/*
 * Split [start_pfn, end_pfn) into numa_fake equally sized nodes.
 * Returns 0 on success, -1 if no usable hash shift could be found.
 */
static int numa_emulation(u64 start_pfn, u64 end_pfn)
{
    int i;
    struct node nodes[MAX_NUMNODES];
    u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

    /* Kludge needed for the hash function: round the per-node size
       down to a power of two so the map can be hashed evenly. */
    if (hweight64(sz) > 1) {
        u64 x = 1;
        while ((x << 1) < sz)
            x <<= 1;
        if (x < sz/2)
            printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
        sz = x;
    }

    memset(&nodes,0,sizeof(nodes));
    for (i = 0; i < numa_fake; i++) {
        nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
        /* The last node absorbs any rounding remainder. */
        if (i == numa_fake-1)
            sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
        nodes[i].end = nodes[i].start + sz;
        printk(KERN_INFO "Faking node %d at %"PRIx64"-%"PRIx64" (%"PRIu64"MB)\n",
               i,
               nodes[i].start, nodes[i].end,
               (nodes[i].end - nodes[i].start) >> 20);
        node_set_online(i);
    }
    memnode_shift = compute_hash_shift(nodes, numa_fake, NULL);
    if (memnode_shift < 0) {
        memnode_shift = 0;
        printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
        return -1;
    }
    for_each_online_node(i)
        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
    numa_init_array();

    return 0;
}
#endif
251
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
255
#ifdef CONFIG_NUMA_EMU
256
if (numa_fake && !numa_emulation(start_pfn, end_pfn))
260
#ifdef CONFIG_ACPI_NUMA
261
if (!numa_off && !acpi_scan_nodes((u64)start_pfn << PAGE_SHIFT,
262
(u64)end_pfn << PAGE_SHIFT))
266
printk(KERN_INFO "%s\n",
267
numa_off ? "NUMA turned off" : "No NUMA configuration found");
269
printk(KERN_INFO "Faking a node at %016"PRIx64"-%016"PRIx64"\n",
270
(u64)start_pfn << PAGE_SHIFT,
271
(u64)end_pfn << PAGE_SHIFT);
272
/* setup dummy node covering all memory */
273
memnode_shift = BITS_PER_LONG - 1;
274
memnodemap = _memnodemap;
275
nodes_clear(node_online_map);
277
for (i = 0; i < NR_CPUS; i++)
279
node_to_cpumask[0] = cpumask_of_cpu(0);
280
setup_node_bootmem(0, (u64)start_pfn << PAGE_SHIFT, (u64)end_pfn << PAGE_SHIFT);
283
__cpuinit void numa_add_cpu(int cpu)
285
cpu_set(cpu, node_to_cpumask[cpu_to_node(cpu)]);
288
void __cpuinit numa_set_node(int cpu, int node)
290
cpu_to_node[cpu] = node;
294
static __init int numa_setup(char *opt)
296
if (!strncmp(opt,"off",3))
298
if (!strncmp(opt,"on",2))
300
#ifdef CONFIG_NUMA_EMU
301
if(!strncmp(opt, "fake=", 5)) {
303
numa_fake = simple_strtoul(opt+5,NULL,0); ;
304
if (numa_fake >= MAX_NUMNODES)
305
numa_fake = MAX_NUMNODES;
308
#ifdef CONFIG_ACPI_NUMA
309
if (!strncmp(opt,"noacpi",6)) {
318
* Setup early cpu_to_node.
320
* Populate cpu_to_node[] only if x86_cpu_to_apicid[],
321
* and apicid_to_node[] tables have valid entries for a CPU.
322
* This means we skip cpu_to_node[] initialisation for NUMA
323
* emulation and faking node case (when running a kernel compiled
324
* for NUMA on a non NUMA box), which is OK as cpu_to_node[]
325
* is already initialized in a round robin manner at numa_init_array,
326
* prior to this call, and this initialization is good enough
327
* for the fake NUMA cases.
329
void __devinit init_cpu_to_node(void)
332
for (i = 0; i < NR_CPUS; i++) {
333
u32 apicid = x86_cpu_to_apicid[i];
334
if (apicid == BAD_APICID)
336
node = apicid_to_node[apicid];
337
if ( node == NUMA_NO_NODE || !node_online(node) )
339
numa_set_node(i, node);
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
349
static void dump_numa(unsigned char key)
351
s_time_t now = NOW();
354
struct page_info *page;
355
unsigned int page_num_node[MAX_NUMNODES];
357
printk("'%c' pressed -> dumping numa info (now-0x%X:%08X)\n", key,
358
(u32)(now>>32), (u32)now);
360
for_each_online_node(i) {
361
paddr_t pa = (paddr_t)(NODE_DATA(i)->node_start_pfn + 1)<< PAGE_SHIFT;
362
printk("idx%d -> NODE%d start->%lu size->%lu\n",
363
i, NODE_DATA(i)->node_id,
364
NODE_DATA(i)->node_start_pfn,
365
NODE_DATA(i)->node_spanned_pages);
366
/* sanity check phys_to_nid() */
367
printk("phys_to_nid(%"PRIpaddr") -> %d should be %d\n", pa, phys_to_nid(pa),
368
NODE_DATA(i)->node_id);
370
for_each_online_cpu(i)
371
printk("CPU%d -> NODE%d\n", i, cpu_to_node[i]);
373
rcu_read_lock(&domlist_read_lock);
375
printk("Memory location of each domain:\n");
378
printk("Domain %u (total: %u):\n", d->domain_id, d->tot_pages);
380
for_each_online_node(i)
381
page_num_node[i] = 0;
383
page_list_for_each(page, &d->page_list)
385
i = phys_to_nid((paddr_t)page_to_mfn(page) << PAGE_SHIFT);
389
for_each_online_node(i)
390
printk(" Node %u: %u\n", i, page_num_node[i]);
393
rcu_read_unlock(&domlist_read_lock);
396
static struct keyhandler dump_numa_keyhandler = {
399
.desc = "dump numa info"
402
static __init int register_numa_trigger(void)
404
register_keyhandler('u', &dump_numa_keyhandler);
407
__initcall(register_numa_trigger);