/* Simple NUMA library.
   Copyright (C) 2003,2004 Andi Kleen, SuSE Labs.

   libnuma is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; version
   2.1.

   libnuma is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should find a copy of v2.1 of the GNU Lesser General Public License
   somewhere on your Linux system; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

   All calls are undefined when numa_available returns an error. */
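/* Illustrative sketch (not part of the library): per the contract above,
   every other call must be gated on numa_available(), e.g.:

	if (numa_available() < 0) {
		fprintf(stderr, "NUMA is not supported on this system\n");
		exit(1);
	}
	... only then call numa_alloc_onnode(), numa_bind(), etc. ...
*/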
#define WEAK __attribute__((weak))

#define CPU_BUFFER_SIZE 4096	/* This limits you to 32768 CPUs */

nodemask_t numa_no_nodes;
nodemask_t numa_all_nodes;
#if __GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
#warning "not threadsafe"
#define __thread	/* no thread-local storage on old gcc; these become plain globals */
#endif

static __thread int bind_policy = MPOL_BIND;
static __thread int mbind_flags = 0;

int numa_exit_on_error = 0;

make_internal_alias(numa_exit_on_error);
/* Next two can be overridden by the application for different error handling */
WEAK void numa_error(char *where)
{
	perror(where);
	if (numa_exit_on_error_int)
		exit(1);
}

make_internal_alias(numa_error);
WEAK void numa_warn(int num, char *fmt, ...)
{
	static unsigned warned;
	va_list ap;

	/* Give each warning only once */
	if ((1<<num) & warned)
		return;
	warned |= (1<<num);

	va_start(ap, fmt);
	fprintf(stderr, "libnuma: Warning: ");
	vfprintf(stderr, fmt, ap);
	fputc('\n', stderr);
	va_end(ap);
}

make_internal_alias(numa_warn);
static void setpol(int policy, nodemask_t mask)
{
	if (set_mempolicy_int(policy, &mask.n[0], NUMA_NUM_NODES+1) < 0)
		numa_error_int("set_mempolicy");
}

static void getpol(int *oldpolicy, nodemask_t *oldmask)
{
	if (get_mempolicy_int(oldpolicy, oldmask->n, NUMA_NUM_NODES+1, 0, 0) < 0)
		numa_error_int("get_mempolicy");
}

static void dombind(void *mem, size_t size, int pol, nodemask_t *nodes)
{
	/* nodes may be NULL (MPOL_DEFAULT/MPOL_PREFERRED); don't dereference it then */
	if (mbind_int(mem, size, pol, nodes ? nodes->n : NULL,
		      nodes ? NUMA_NUM_NODES+1 : 0, mbind_flags) < 0)
		numa_error_int("mbind");
}
/* Note: gives the wrong answer for hugetlbfs mappings. */
int numa_pagesize(void)
{
	static int pagesize;
	if (pagesize > 0)
		return pagesize;
	pagesize = getpagesize();
	return pagesize;
}

make_internal_alias(numa_pagesize);
static int maxnode = -1;
static int maxcpus = -1;
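/* Assumed helpers from the internal header (not shown in this file):
   CPU_BYTES(n) is the number of bytes needed for an n-CPU bitmask and
   CPU_WORDS(n) the number of unsigned longs, roughly

	#define CPU_BYTES(x) (round_up(x, BITS_PER_LONG) / 8)
	#define CPU_WORDS(x) (CPU_BYTES(x) / sizeof(long))

   The exact definitions live in the internal header; they are sketched here
   only so the uses below are readable. */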
static int number_of_cpus(void)
{
	char *line = NULL;
	size_t len = 0;
	char *s;
	int cpu;
	FILE *f;

	if (maxcpus >= 0)
		return maxcpus;

	f = fopen("/proc/cpuinfo","r");
	if (!f) {
		int n, i, k;
		unsigned long buffer[CPU_WORDS(8192)];
		memset(buffer, 0, sizeof(buffer));
		n = numa_sched_getaffinity_int(getpid(), sizeof(buffer), buffer);
		if (n >= 0) {
			for (i = 0; i < n / sizeof(long); i++) {
				/* check every bit of the word, not just the low 8 */
				for (k = 0; k < BITS_PER_LONG; k++)
					if (buffer[i] & (1UL<<k))
						maxcpus = i*BITS_PER_LONG + k;
			}
			return maxcpus;
		}
		numa_warn_int(W_noproc, "/proc not mounted. Assuming zero nodes: %s",
			      strerror(errno));
		return 0;
	}
	maxcpus = 0;
	while (getdelim(&line, &len, '\n', f) > 0) {
		if (strncmp(line,"processor",9))
			continue;
		s = line + strcspn(line, "0123456789");
		if (sscanf(s, "%d", &cpu) == 1 && cpu > maxcpus)
			maxcpus = cpu;
	}
	free(line);
	fclose(f);
	return maxcpus;
}
static int fallback_max_node(void)
{
	numa_warn_int(W_nosysfs, "/sys not mounted or no numa system. Assuming one node per CPU: %s",
		      strerror(errno));
	maxnode = number_of_cpus();
	return maxnode;
}
int numa_max_node(void)
{
	DIR *d;
	struct dirent *de;
	int found = 0;

	if (maxnode >= 0)
		return maxnode;

	/* No hotplug yet. */
	d = opendir("/sys/devices/system/node");
	if (!d)
		return fallback_max_node();
	while ((de = readdir(d)) != NULL) {
		int nd;
		if (strncmp(de->d_name, "node", 4))
			continue;
		found++;
		nd = strtoul(de->d_name+4, NULL, 0);
		if (nd > maxnode)
			maxnode = nd;
	}
	closedir(d);
	if (!found)
		return fallback_max_node();
	return maxnode;
}

make_internal_alias(numa_max_node);
/* (cache the result?) */
long long numa_node_size64(int node, long long *freep)
{
	size_t len = 0;
	char *line = NULL;
	long long size = -1;
	FILE *f;
	char fn[64];
	int ok = 0;
	int required = freep ? 2 : 1;

	if (freep)
		*freep = -1;
	sprintf(fn, "/sys/devices/system/node/node%d/meminfo", node);
	f = fopen(fn, "r");
	if (!f)
		return -1;
	while (getdelim(&line, &len, '\n', f) > 0) {
		char *end;
		char *s = strcasestr(line, "kB");
		if (!s)
			continue;
		--s;
		/* back up over the spaces and digits preceding "kB" */
		while (s > line && isspace(*s))
			--s;
		while (s > line && isdigit(*s))
			--s;
		if (strstr(line, "MemTotal")) {
			size = strtoull(s,&end,0) << 10;
			if (end == s)
				size = -1;
			else
				ok++;
		}
		if (freep && strstr(line, "MemFree")) {
			*freep = strtoull(s,&end,0) << 10;
			if (end == s)
				*freep = -1;
			else
				ok++;
		}
	}
	fclose(f);
	free(line);
	if (ok != required)
		numa_warn_int(W_badmeminfo, "Cannot parse sysfs meminfo (%d)", ok);
	return size;
}

make_internal_alias(numa_node_size64);

long numa_node_size(int node, long *freep)
{
	long long f2;
	long sz = numa_node_size64_int(node, &f2);
	if (freep)
		*freep = f2;
	return sz;
}
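/* Usage sketch (illustrative, not part of the library): report the memory
   of node 0 once numa_available() has succeeded:

	long free, total;
	total = numa_node_size(0, &free);
	printf("node0: %ld bytes total, %ld bytes free\n", total, free);
*/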
int numa_available(void)
{
	int max, i;
	if (get_mempolicy_int(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
		return -1;
	max = numa_max_node_int();
	for (i = 0; i <= max; i++)
		nodemask_set(&numa_all_nodes, i);
	return 0;
}
void numa_interleave_memory(void *mem, size_t size, nodemask_t *mask)
{
	dombind(mem, size, MPOL_INTERLEAVE, mask);
}

void numa_tonode_memory(void *mem, size_t size, int node)
{
	nodemask_t nodes;
	nodemask_zero(&nodes);
	nodemask_set(&nodes, node);
	dombind(mem, size, bind_policy, &nodes);
}

void numa_tonodemask_memory(void *mem, size_t size, nodemask_t *mask)
{
	dombind(mem, size, bind_policy, mask);
}

void numa_setlocal_memory(void *mem, size_t size)
{
	dombind(mem, size, MPOL_PREFERRED, NULL);
}

void numa_police_memory(void *mem, size_t size)
{
	int pagesize = numa_pagesize_int();
	unsigned long i;
	/* Touch (read) one byte per page so the kernel actually instantiates
	   the pages under the currently set policy. */
	for (i = 0; i < size; i += pagesize)
		asm volatile("" :: "r" (((volatile unsigned char *)mem)[i]));
}

make_internal_alias(numa_police_memory);
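/* Sketch (illustrative): interleaving an existing anonymous mapping across
   all nodes, then forcing the pages to be instantiated:

	char *p = mmap(NULL, sz, PROT_READ|PROT_WRITE,
		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	numa_interleave_memory(p, sz, &numa_all_nodes);
	numa_police_memory(p, sz);
*/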
void *numa_alloc(size_t size)
{
	char *mem;
	mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
		   0, 0);
	if (mem == (char *)-1)
		return NULL;
	numa_police_memory_int(mem, size);
	return mem;
}

void *numa_alloc_interleaved_subset(size_t size, nodemask_t *mask)
{
	char *mem;
	mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
		   0, 0);
	if (mem == (char *)-1)
		return NULL;
	dombind(mem, size, MPOL_INTERLEAVE, mask);
	return mem;
}

make_internal_alias(numa_alloc_interleaved_subset);

void *numa_alloc_interleaved(size_t size)
{
	return numa_alloc_interleaved_subset_int(size, &numa_all_nodes);
}
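/* Usage sketch (illustrative): an interleaved scratch buffer, released with
   numa_free() rather than free():

	double *buf = numa_alloc_interleaved(n * sizeof(double));
	if (buf) {
		buf[0] = 1.0;	// pages are spread round-robin on first touch
		numa_free(buf, n * sizeof(double));
	}
*/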
void numa_set_interleave_mask(nodemask_t *mask)
{
	if (nodemask_equal(mask, &numa_no_nodes))
		setpol(MPOL_DEFAULT, *mask);
	else
		setpol(MPOL_INTERLEAVE, *mask);
}

nodemask_t numa_get_interleave_mask(void)
{
	int oldpolicy;
	nodemask_t mask;
	getpol(&oldpolicy, &mask);
	if (oldpolicy == MPOL_INTERLEAVE)
		return mask;
	return numa_no_nodes;
}

int numa_get_interleave_node(void)
{
	int nd;
	if (get_mempolicy_int(&nd, NULL, 0, 0, MPOL_F_NODE) == 0)
		return nd;
	return 0;
}
void *numa_alloc_onnode(size_t size, int node)
{
	char *mem;
	nodemask_t nodes;
	nodemask_zero(&nodes);
	nodemask_set(&nodes, node);
	mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
		   0, 0);
	if (mem == (char *)-1)
		return NULL;
	dombind(mem, size, bind_policy, &nodes);
	return mem;
}

void *numa_alloc_local(size_t size)
{
	char *mem;
	mem = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS,
		   0, 0);
	if (mem == (char *)-1)
		return NULL;
	dombind(mem, size, MPOL_DEFAULT, NULL);
	return mem;
}
void numa_set_bind_policy(int strict)
{
	if (strict)
		bind_policy = MPOL_BIND;
	else
		bind_policy = MPOL_PREFERRED;
}

void numa_set_membind(nodemask_t *mask)
{
	setpol(MPOL_BIND, *mask);
}

make_internal_alias(numa_set_membind);

nodemask_t numa_get_membind(void)
{
	int oldpolicy;
	nodemask_t nodes;
	getpol(&oldpolicy, &nodes);
	if (oldpolicy == MPOL_BIND)
		return nodes;
	return numa_all_nodes;
}
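/* Sketch (illustrative): restrict all future allocations of this thread to
   nodes 0 and 1, then read the policy back:

	nodemask_t m;
	nodemask_zero(&m);
	nodemask_set(&m, 0);
	nodemask_set(&m, 1);
	numa_set_membind(&m);
	m = numa_get_membind();	// the mask just set, while MPOL_BIND is active
*/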
void numa_free(void *mem, size_t size)
{
	munmap(mem, size);
}

static unsigned long *node_cpu_mask[NUMA_NUM_NODES];

/* This would be better with some locking, but I don't want to make libnuma
   dependent on pthreads right now. The races are relatively harmless. */
int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen)
{
	char fn[64], *line = NULL;
	size_t len = 0;
	char *s;
	int i;
	FILE *f;
	unsigned *mask, prev;
	int ncpus = number_of_cpus();
	int buflen_needed = CPU_BYTES(ncpus);

	if ((unsigned)node > maxnode || bufferlen < buflen_needed) {
		errno = ERANGE;
		return -1;
	}
	/* did an earlier call already cache this node's mask? */
	if (node_cpu_mask[node]) {
		if (bufferlen > buflen_needed)
			memset(buffer, 0, bufferlen);
		memcpy(buffer, node_cpu_mask[node], buflen_needed);
		return 0;
	}

	mask = malloc(buflen_needed);
	if (!mask)
		mask = (unsigned *)buffer;
	memset(mask, 0, buflen_needed);

	sprintf(fn, "/sys/devices/system/node/node%d/cpumap", node);
	f = fopen(fn, "r");
	if (!f || getdelim(&line, &len, '\n', f) < 1) {
		numa_warn_int(W_nosysfs2,
		   "/sys not mounted or invalid. Assuming nodes equal CPU: %s",
			strerror(errno));
		set_bit(node, (unsigned long *)mask);
	} else {
		/* parse comma separated hex words, most significant group first */
		s = line;
		prev = 0;
		while (*s && *s != '\n') {
			unsigned num = 0;
			for (i = 0; s[i] && s[i] != ','; i++) {
				static const char hexdigits[] = "0123456789abcdef";
				char *w = strchr(hexdigits, tolower(s[i]));
				if (!w) {
					if (isspace(s[i]))
						break;
					numa_warn_int(W_cpumap,
					   "Unexpected character `%c' in sysfs cpumap", s[i]);
					goto out;
				}
				num = (num*16) + (w - hexdigits);
			}
			s += i;
			if (*s == ',')
				s++;
			/* skip leading zeros */
			if (num == 0 && prev == 0)
				continue;
			prev |= num;
			/* shift earlier groups up one word, insert this one at the bottom */
			memmove(mask + 1, mask, buflen_needed - sizeof(unsigned));
			mask[0] = num;
		}
	}
out:
	if (f)
		fclose(f);
	free(line);
	if (mask != (unsigned *)buffer)
		memcpy(buffer, mask, buflen_needed);

	/* slightly racy, see above */
	if (node_cpu_mask[node]) {
		if (mask != (unsigned *)buffer)
			free(mask);
	} else
		node_cpu_mask[node] = (unsigned long *)mask;
	return 0;
}

make_internal_alias(numa_node_to_cpus);
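/* Usage sketch (illustrative; ncpus stands for the CPU count): list the
   CPUs of node 0. The buffer must be at least CPU_BYTES(ncpus) bytes:

	unsigned long cpumask[CPU_WORDS(ncpus)];
	if (numa_node_to_cpus(0, cpumask, sizeof(cpumask)) == 0) {
		int c;
		for (c = 0; c < ncpus; c++)
			if (cpumask[c / BITS_PER_LONG] & (1UL << (c % BITS_PER_LONG)))
				printf("cpu %d is on node 0\n", c);
	}
*/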
int numa_run_on_node_mask(nodemask_t *mask)
{
	int ncpus = number_of_cpus();
	int i, k, err;
	unsigned long cpus[CPU_WORDS(ncpus)], nodecpus[CPU_WORDS(ncpus)];
	memset(cpus, 0, CPU_BYTES(ncpus));
	for (i = 0; i < NUMA_NUM_NODES; i++) {
		if (mask->n[i / BITS_PER_LONG] == 0)
			continue;
		if (nodemask_isset(mask, i)) {
			if (numa_node_to_cpus_int(i, nodecpus, CPU_BYTES(ncpus)) < 0) {
				numa_warn_int(W_noderunmask,
					"Cannot read node cpumask from sysfs");
				continue;
			}
			for (k = 0; k < CPU_WORDS(ncpus); k++)
				cpus[k] |= nodecpus[k];
		}
	}
	err = numa_sched_setaffinity_int(getpid(), CPU_BYTES(ncpus), cpus);

	/* The sched_setaffinity API is broken because it expects
	   the user to guess the kernel cpuset size. Do this in a
	   brute force way: retry with successively larger buffers. */
	if (err < 0 && errno == EINVAL) {
		int savederrno = errno;
		char *bigbuf;
		int me = getpid();
		static int size = -1;
		if (size == -1)
			size = CPU_BYTES(ncpus) * 2;
		bigbuf = malloc(CPU_BUFFER_SIZE);
		if (!bigbuf)
			return -1;
		errno = savederrno;
		while (size <= CPU_BUFFER_SIZE) {
			memcpy(bigbuf, cpus, CPU_BYTES(ncpus));
			memset(bigbuf + CPU_BYTES(ncpus), 0,
			       CPU_BUFFER_SIZE - CPU_BYTES(ncpus));
			err = numa_sched_setaffinity_int(me, size, (unsigned long *)bigbuf);
			if (err == 0 || errno != EINVAL)
				break;
			size *= 2;
		}
		free(bigbuf);
	}
	return err;
}

make_internal_alias(numa_run_on_node_mask);
nodemask_t numa_get_run_node_mask(void)
{
	int ncpus = number_of_cpus();
	int i, k;
	nodemask_t mask;
	unsigned long cpus[CPU_WORDS(ncpus)], nodecpus[CPU_WORDS(ncpus)];

	memset(cpus, 0, CPU_BYTES(ncpus));
	nodemask_zero(&mask);
	if (numa_sched_getaffinity_int(getpid(), CPU_BYTES(ncpus), cpus) < 0)
		return numa_no_nodes;
	/* somewhat dumb algorithm */
	for (i = 0; i < NUMA_NUM_NODES; i++) {
		if (numa_node_to_cpus_int(i, nodecpus, CPU_BYTES(ncpus)) < 0) {
			numa_warn_int(W_noderunmask, "Cannot read node cpumask from sysfs");
			continue;
		}
		/* compare each word of the node's CPU mask against our affinity mask */
		for (k = 0; k < CPU_WORDS(ncpus); k++) {
			if (nodecpus[k] & cpus[k])
				nodemask_set(&mask, i);
		}
	}
	return mask;
}
int numa_run_on_node(int node)
{
	int ncpus = number_of_cpus();
	unsigned long cpus[CPU_WORDS(ncpus)];

	if (node == -1)
		memset(cpus, 0xff, CPU_BYTES(ncpus));
	else if (node < NUMA_NUM_NODES) {
		if (numa_node_to_cpus_int(node, cpus, CPU_BYTES(ncpus)) < 0) {
			numa_warn_int(W_noderunmask, "Cannot read node cpumask from sysfs");
			return -1;
		}
	} else {
		errno = EINVAL;
		return -1;
	}
	return numa_sched_setaffinity_int(getpid(), CPU_BYTES(ncpus), cpus);
}
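/* Usage sketch (illustrative): keep the current thread on node 1 and make
   its allocations come from there as well:

	if (numa_run_on_node(1) < 0)
		perror("numa_run_on_node");
	numa_set_preferred(1);
*/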
int numa_preferred(void)
{
	int policy;
	nodemask_t nodes;

	getpol(&policy, &nodes);
	if (policy == MPOL_PREFERRED || policy == MPOL_BIND) {
		int i;
		int max = NUMA_NUM_NODES;
		for (i = 0; i < max; i++)
			if (nodemask_isset(&nodes, i))
				return i;
	}
	/* could read the current CPU from /proc/self/status. Probably
	   not worth it. */
	return 0; /* or random one? */
}
void numa_set_preferred(int node)
{
	nodemask_t n;
	if (node == -1) {
		nodemask_t empty;
		nodemask_zero(&empty);
		setpol(MPOL_DEFAULT, empty);
		return;
	}
	nodemask_zero(&n);
	nodemask_set(&n, node);
	setpol(MPOL_PREFERRED, n);
}

void numa_set_localalloc(void)
{
	nodemask_t empty;
	nodemask_zero(&empty);
	setpol(MPOL_PREFERRED, empty);
}
void numa_bind(nodemask_t *nodemask)
{
	numa_run_on_node_mask_int(nodemask);
	numa_set_membind_int(nodemask);
}
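/* Usage sketch (illustrative): numa_bind() combines CPU and memory binding
   in one call, equivalent to the two calls in its body:

	nodemask_t m;
	nodemask_zero(&m);
	nodemask_set(&m, 0);
	numa_bind(&m);	// run on node 0 and allocate only from node 0
*/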
void numa_set_strict(int flag)
{
	if (flag)
		mbind_flags |= MPOL_MF_STRICT;
	else
		mbind_flags &= ~MPOL_MF_STRICT;
}
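/* End-to-end sketch (illustrative only; assumes linking with -lnuma and a
   machine with at least two nodes):

	#include <numa.h>

	int main(void)
	{
		if (numa_available() < 0)
			return 1;
		void *p = numa_alloc_onnode(1 << 20, 1);
		if (p) {
			memset(p, 0, 1 << 20);	// first touch places pages on node 1
			numa_free(p, 1 << 20);
		}
		return 0;
	}
*/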