15
15
* Nonstandard instructions used:
19
/*
 * ac_memcpy_mmx: copy `bytes` bytes from `src` to `dest` using MMX moves.
 *
 * Register roles (from the asm input constraints): EDI = dest, ESI = src,
 * ECX = remaining byte count.  EBX holds the constant 64 and is saved and
 * restored around the body because it is the PIC register.
 *
 * Visible phases:
 *   - counts below 64 bytes branch straight to mmx.memcpy_last;
 *   - the destination is first advanced to a multiple of 8 bytes;
 *   - data is then copied in blocks of at most 63 cache-line pairs
 *     (64 bytes each): the source block is touched in reverse order,
 *     then moved 64 bytes per iteration through mm0-mm7;
 *   - the final <64 bytes are copied with MOVSD/MOVSB via computed jumps;
 *   - EMMS clears the MMX state before the saved registers are popped.
 *
 * The block limit must be written as the immediate $63.  In AT&T syntax a
 * bare 63 is an absolute memory operand, i.e. a read from address 63.
 */
void *ac_memcpy_mmx(void *dest, const void *src, size_t bytes)
22
#PENTIUM_LINE_SIZE = 32 # PMMX/PII cache line size \n\
23
#PENTIUM_CACHE_SIZE = 8192 # PMMX/PII total cache size \n\
24
# Use only half because writes may touch the cache too (PII) \n\
25
#PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
27
push %%ebx # Save PIC register \n\
28
push %%edi # Save destination for return value \n\
29
cld # MOVS* should ascend \n\
31
mov $64, %%ebx # Constant \n\
34
jb mmx.memcpy_last # Just use movs if <64 bytes \n\
36
# First align destination address to a multiple of 8 bytes \n\
37
mov $8, %%eax # EAX <- (8-dest) & 7 \n\
39
and $0b111, %%eax # ... which is the number of bytes to copy\n\
40
lea 0f, %%edx # Use a computed jump--faster than a loop\n\
42
jmp *%%edx # Execute 0-7 MOVSB's \n\
50
0: sub %%eax, %%ecx # Update count \n\
52
# Now copy data in blocks \n\
53
0: mov %%ecx, %%edx # EDX <- ECX >> 6 (cache lines to copy) \n\
55
jz mmx.memcpy_last # <64 bytes left? Skip to end \n\
56
cmp $63, %%edx # Solaris x86 fix: immediate PENTIUM_CACHE_BLOCK/64 \n\
57
jb 1f # Limit size of block \n\
58
mov $63, %%edx # Solaris x86 fix: immediate, not a load from address 63 \n\
59
1: mov %%edx, %%eax # EAX <- EDX << 6 (bytes to copy) \n\
61
sub %%eax, %%ecx # Update remaining count \n\
62
add %%eax, %%esi # Point to end of region to be block-copied\n\
63
2: test %%eax, -32(%%esi) # Touch each cache line in reverse order\n\
64
test %%eax, -64(%%esi) \n\
65
sub %%ebx, %%esi # Update pointer \n\
66
sub %%ebx, %%eax # And loop \n\
68
# Note that ESI now points to the beginning of the block \n\
69
3: movq (%%esi), %%mm0 # Do the actual copy, 64 bytes at a time\n\
70
movq 8(%%esi), %%mm1 \n\
71
movq 16(%%esi), %%mm2 \n\
72
movq 24(%%esi), %%mm3 \n\
73
movq 32(%%esi), %%mm4 \n\
74
movq 40(%%esi), %%mm5 \n\
75
movq 48(%%esi), %%mm6 \n\
76
movq 56(%%esi), %%mm7 \n\
77
movq %%mm0, (%%edi) \n\
78
movq %%mm1, 8(%%edi) \n\
79
movq %%mm2, 16(%%edi) \n\
80
movq %%mm3, 24(%%edi) \n\
81
movq %%mm4, 32(%%edi) \n\
82
movq %%mm5, 40(%%edi) \n\
83
movq %%mm6, 48(%%edi) \n\
84
movq %%mm7, 56(%%edi) \n\
85
add %%ebx, %%esi # Update pointers \n\
87
dec %%edx # And loop \n\
92
# Copy last <64 bytes, using the computed jump trick \n\
93
mov %%ecx, %%eax # EAX <- ECX>>2 \n\
97
jmp *%%edx # Execute 0-15 MOVSD's \n\
113
0: and $0b11, %%ecx # ECX <- ECX & 3 \n\
116
jmp *%%edx # Execute 0-3 MOVSB's \n\
122
emms # Clean up MMX state \n\
123
pop %%edi # Restore destination (return value) \n\
124
pop %%ebx # Restore PIC register \n\
126
: "D" (dest), "S" (src), "c" (bytes)
19
132
void *ac_memcpy_mmx(void *dest, const void *src, size_t bytes)