1
// $Id: x86copy.gcc,v 1.14 2003-02-06 18:49:36 manoj Exp $
2
// memory copy implementation derived from examples 3 and 4 at
3
// http://www.sgi.com/developers/library/resources/asc_cpu.html
5
// void *armci_asm_memcpy(void *dst, const void *src, size_t n, int id);
6
// id={0/1} - is used to support up to 2 threads
8
// we turn the assembly memcpy on/off based on checking of the CPU type
16
#define CPUFAMILY 0x00000F00  /* mask for CPUID eax bits 8-11; shifted >>8 to get the family */
17
#define MODEL 0x000000F0  /* mask for CPUID eax bits 4-7; shifted >>4 to get the model */
19
#define MMXSUPPORT 0x00800000  /* CPUID edx feature bit 23: MMX available */
20
#define SSESUPPORT 0x02000000  /* CPUID edx feature bit 25: SSE available (needed for movntq/sfence) */
21
#define IA64 0x70000000  /* tested against CPUID edx; NOTE(review): covers bits 28-30, but the
                            architectural IA-64 flag is bit 30 only (0x40000000) — confirm intent */
22
/* CPUID vendor-id string: 12 chars stored from ebx/edx/ecx plus NUL terminator;
 * compared against "GenuineIntel" / "AuthenticAMD" to classify the CPU. */
static char vendor[13];
23
/* CPU classification returned by cpu_check(). The 'unkown' [sic] spelling is
 * part of the public enumerator set and is deliberately left unchanged. */
typedef enum {unkown=0,p5, p2, p2m, p3, p4, pro, p6, K5, K6, K7, ia64} CPUTYPE;
26
#define EX4 // use sgi's example_4_cpy if n is greater than 2k
27
// use memcpy and example_3_memcpy for the rest
28
//#define EX3 // use memcpy and example_3_memcpy
30
/* 2KB staging buffer for the buffered copy (asm_memcpy4); one of two buffers
 * selected by bufid — presumably one per thread (id={0/1}) — TODO confirm. */
static char tbuf0[2048];
31
/* Second 2KB staging buffer, used when the other thread id selects it. */
static char tbuf1[2048];
33
/* -1 = not yet probed; set lazily to asmcpy_works() on first copy call.
 * 0 disables the assembly path (fall back to memcpy), nonzero enables it. */
static int use_asm_copy=-1;
36
void *asm_memcpy3(void *dst, const void *src, size_t n)
55
movntq %%mm3, 16(%1);\
56
movntq %%mm4, 24(%1);\
57
movntq %%mm5, 32(%1);\
58
movntq %%mm6, 40(%1);\
59
movntq %%mm7, 48(%1);\
60
movntq %%mm0, 56(%1);\
68
: "r"(src), "r"(dst), "c"(n)
76
void *asm_memcpy4(void *dst, const void *src, size_t n, int bufid)
94
prefetchnta 64(%%esi); \
95
prefetchnta 96(%%esi); \
97
movq 0(%%esi), %%mm1; \
98
movq 8(%%esi), %%mm2; \
99
movq 16(%%esi), %%mm3; \
100
movq 24(%%esi), %%mm4; \
101
movq 32(%%esi), %%mm5; \
102
movq 40(%%esi), %%mm6; \
103
movq 48(%%esi), %%mm7; \
104
movq 56(%%esi), %%mm0; \
108
movq %%mm3, 16(%4); \
109
movq %%mm4, 24(%4); \
110
movq %%mm5, 32(%4); \
111
movq %%mm6, 40(%4); \
112
movq %%mm7, 48(%4); \
113
movq %%mm0, 56(%4); \
128
movq 16(%4), %%mm3; \
130
movq 32(%4), %%mm5; \
131
movq 40(%4), %%mm6; \
132
movq 48(%4), %%mm7; \
133
movq 56(%4), %%mm0; \
135
movntq %%mm1, 0(%%edi); \
136
movntq %%mm2, 8(%%edi); \
137
movntq %%mm3, 16(%%edi); \
138
movntq %%mm4, 24(%%edi); \
139
movntq %%mm5, 32(%%edi); \
140
movntq %%mm6, 40(%%edi); \
141
movntq %%mm7, 48(%%edi); \
142
movntq %%mm0, 56(%%edi); \
154
: "S"(src), "D"(dst), "0"(n), "r"(ptbuf)
191
:"%eax", "%edx", "%edi"
195
"movl $vendor, %%esi;"
199
"movl %%ebx, 0(%%esi);"
200
"movl %%edx, 4(%%esi);"
201
"movl %%ecx, 8(%%esi);"
205
: "%ecx", "%edx", "%esi", "%edi"
209
printf("eax=%x\n", reax);
210
printf("edx=%x\n", redx);
211
printf("vendor = %s\n", vendor);
214
family = (CPUFAMILY & reax) >> 8;
215
model = (MODEL & reax) >> 4;
217
mmxsupport = (redx & MMXSUPPORT);
218
ssesupport = (redx & SSESUPPORT);
219
isIA64 = (redx & IA64);
222
printf("mmx support = %s\n", mmxsupport ? "yes" : "no");
223
printf("SSE support = %s\n", ssesupport ? "yes" : "no");
226
if(strcmp(vendor, "GenuineIntel") == 0){
236
if(model == 1) return pro;
237
else if(model == 3 || model == 5) return p2;
238
else if(model == 6 ) return p2m; //pentium II mobile/celeron
239
else if( model==7 || model == 8 ) //celeron/p3
244
case 0xf: // extended family
245
if((reax & 0x0ff00000) == 0 && model == 0) return p4;
249
else if(strcmp(vendor, "AuthenticAMD") == 0){
257
else if(family == 6){
274
static inline int asmcpy_works()
276
CPUTYPE type = cpu_check();
278
if( type == p3 || type == K7 || type == p4)
284
#include "tas-i386.h"
285
/* Lock word acquired via testandset() to serialize the n>=2048 buffered copy
 * path (asm_memcpy4), which is not safe for concurrent use by both threads. */
int _x86copy_mutex=0;
289
// 128<=n<2048 MMX copy
290
// n >2047 MMX copy with buffer
292
void *armci_asm_memcpy_nofence(void *dst, const void *src, size_t n, int bufid)
297
if(use_asm_copy<0) use_asm_copy = asmcpy_works();
298
if(!use_asm_copy || (n<128) ) return memcpy(dst, src, n);
301
/* memcpy4 has problems in multithreaded environment -- we allow only
302
* one thread to use it
304
if(n>=2048)locked = testandset((int*)&_x86copy_mutex);
307
residual = (int)n % 64;
308
if(residual != 0) memcpy(dst, src, residual);
310
asm_memcpy3((char*)dst+residual, (char*)src+residual, n - residual);
313
residual = (int)n % 2048;
316
int res64 = residual%64;
317
if(res64 != 0) memcpy(dst, src, res64);
319
asm_memcpy3((char*)dst+res64, (char*)src+res64, residual - res64);
323
asm_memcpy4((char*)dst + residual, (char*)src + residual,
324
n - residual, bufid);
334
void *armci_asm_memcpy(void *dst, const void *src, size_t n, int bufid)
337
if(use_asm_copy<0) use_asm_copy = asmcpy_works();
338
if(!use_asm_copy ||n<128) return memcpy(dst, src, n);
339
p = armci_asm_memcpy_nofence(dst,src,n,bufid);
340
__asm__ __volatile__ ("sfence":::"memory");
345
void armci_asm_mem_fence()
347
__asm__ __volatile__ ("sfence":::"memory");