1
From a0ac24d98ace90d1ccba6a2f3e7d55600f2fdb6e Mon Sep 17 00:00:00 2001
2
From: H.J. Lu <hongjiu.lu@intel.com>
3
Date: Wed, 24 Feb 2010 18:20:57 -0800
4
Subject: [PATCH] Fix issues in x86 memcpy-ssse3.S
6
* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Use unsigned conditional jumps.
14
sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 113 ++++++++++++++++++----------
15
2 files changed, 77 insertions(+), 40 deletions(-)
17
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
18
index 749c82d..ec9eeb9 100644
19
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
20
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
21
@@ -128,7 +128,7 @@ ENTRY (MEMCPY)
23
je L(fwd_write_0bytes)
27
jmp L(bk_write_less32bytes_2)
30
@@ -139,12 +139,12 @@ L(memmove_bwd):
34
- jge L(48bytesormore)
35
+ jae L(48bytesormore)
37
L(fwd_write_less32bytes):
38
#ifndef USE_AS_MEMMOVE
45
@@ -162,6 +162,7 @@ L(48bytesormore):
53
@@ -181,12 +182,14 @@ L(48bytesormore):
62
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
69
@@ -202,7 +205,7 @@ L(shl_0_loop):
70
movdqa %xmm0, (%edx, %edi)
71
movdqa %xmm1, 16(%edx, %edi)
76
movdqa (%eax, %edi), %xmm0
77
movdqa 16(%eax, %edi), %xmm1
78
@@ -210,7 +213,7 @@ L(shl_0_loop):
79
movdqa %xmm0, (%edx, %edi)
80
movdqa %xmm1, 16(%edx, %edi)
85
movdqa (%eax, %edi), %xmm0
86
movdqa 16(%eax, %edi), %xmm1
87
@@ -218,7 +221,7 @@ L(shl_0_loop):
88
movdqa %xmm0, (%edx, %edi)
89
movdqa %xmm1, 16(%edx, %edi)
94
movdqa (%eax, %edi), %xmm0
95
movdqa 16(%eax, %edi), %xmm1
96
@@ -234,6 +237,7 @@ L(shl_0_end):
98
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
103
#ifdef DATA_CACHE_SIZE_HALF
104
@@ -250,7 +254,7 @@ L(shl_0_gobble):
108
- jge L(shl_0_gobble_mem_loop)
109
+ jae L(shl_0_gobble_mem_loop)
110
L(shl_0_gobble_cache_loop):
112
movdqa 0x10(%eax), %xmm1
113
@@ -272,8 +276,7 @@ L(shl_0_gobble_cache_loop):
114
movdqa %xmm7, 0x70(%edx)
117
- jge L(shl_0_gobble_cache_loop)
118
-L(shl_0_gobble_cache_loop_tail):
119
+ jae L(shl_0_gobble_cache_loop)
122
jl L(shl_0_cache_less_64bytes)
123
@@ -294,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail):
125
L(shl_0_cache_less_64bytes):
127
- jl L(shl_0_cache_less_32bytes)
128
+ jb L(shl_0_cache_less_32bytes)
131
movdqa 0x10(%eax), %xmm1
132
@@ -304,7 +307,7 @@ L(shl_0_cache_less_64bytes):
134
L(shl_0_cache_less_32bytes):
136
- jl L(shl_0_cache_less_16bytes)
137
+ jb L(shl_0_cache_less_16bytes)
141
@@ -342,7 +345,7 @@ L(shl_0_gobble_mem_loop):
142
movdqa %xmm7, 0x70(%edx)
145
- jge L(shl_0_gobble_mem_loop)
146
+ jae L(shl_0_gobble_mem_loop)
149
jl L(shl_0_mem_less_64bytes)
150
@@ -363,7 +366,7 @@ L(shl_0_gobble_mem_loop):
152
L(shl_0_mem_less_64bytes):
154
- jl L(shl_0_mem_less_32bytes)
155
+ jb L(shl_0_mem_less_32bytes)
158
movdqa 0x10(%eax), %xmm1
159
@@ -373,7 +376,7 @@ L(shl_0_mem_less_64bytes):
161
L(shl_0_mem_less_32bytes):
163
- jl L(shl_0_mem_less_16bytes)
164
+ jb L(shl_0_mem_less_16bytes)
168
@@ -384,7 +387,8 @@ L(shl_0_mem_less_16bytes):
170
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
177
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
178
@@ -406,7 +410,7 @@ L(shl_1_loop):
179
movdqa %xmm2, -32(%edx, %edi)
180
movdqa %xmm3, -16(%edx, %edi)
185
movdqa 16(%eax, %edi), %xmm2
187
@@ -428,6 +432,8 @@ L(shl_1_end):
189
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
195
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
196
@@ -449,7 +455,7 @@ L(shl_2_loop):
197
movdqa %xmm2, -32(%edx, %edi)
198
movdqa %xmm3, -16(%edx, %edi)
203
movdqa 16(%eax, %edi), %xmm2
205
@@ -471,6 +477,8 @@ L(shl_2_end):
207
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
213
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
214
@@ -492,7 +500,7 @@ L(shl_3_loop):
215
movdqa %xmm2, -32(%edx, %edi)
216
movdqa %xmm3, -16(%edx, %edi)
221
movdqa 16(%eax, %edi), %xmm2
223
@@ -514,6 +522,8 @@ L(shl_3_end):
225
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
231
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
232
@@ -535,7 +545,7 @@ L(shl_4_loop):
233
movdqa %xmm2, -32(%edx, %edi)
234
movdqa %xmm3, -16(%edx, %edi)
239
movdqa 16(%eax, %edi), %xmm2
241
@@ -557,6 +567,8 @@ L(shl_4_end):
243
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
249
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
250
@@ -578,7 +590,7 @@ L(shl_5_loop):
251
movdqa %xmm2, -32(%edx, %edi)
252
movdqa %xmm3, -16(%edx, %edi)
257
movdqa 16(%eax, %edi), %xmm2
259
@@ -600,7 +612,8 @@ L(shl_5_end):
261
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
268
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
269
@@ -622,7 +635,7 @@ L(shl_6_loop):
270
movdqa %xmm2, -32(%edx, %edi)
271
movdqa %xmm3, -16(%edx, %edi)
276
movdqa 16(%eax, %edi), %xmm2
278
@@ -644,6 +657,8 @@ L(shl_6_end):
280
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
286
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
287
@@ -665,7 +680,7 @@ L(shl_7_loop):
288
movdqa %xmm2, -32(%edx, %edi)
289
movdqa %xmm3, -16(%edx, %edi)
294
movdqa 16(%eax, %edi), %xmm2
296
@@ -687,6 +702,8 @@ L(shl_7_end):
298
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
304
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
305
@@ -708,7 +725,7 @@ L(shl_8_loop):
306
movdqa %xmm2, -32(%edx, %edi)
307
movdqa %xmm3, -16(%edx, %edi)
312
movdqa 16(%eax, %edi), %xmm2
314
@@ -730,6 +747,8 @@ L(shl_8_end):
316
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
322
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
323
@@ -751,7 +770,7 @@ L(shl_9_loop):
324
movdqa %xmm2, -32(%edx, %edi)
325
movdqa %xmm3, -16(%edx, %edi)
330
movdqa 16(%eax, %edi), %xmm2
332
@@ -773,6 +792,8 @@ L(shl_9_end):
334
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
340
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
341
@@ -794,7 +815,7 @@ L(shl_10_loop):
342
movdqa %xmm2, -32(%edx, %edi)
343
movdqa %xmm3, -16(%edx, %edi)
348
movdqa 16(%eax, %edi), %xmm2
350
@@ -816,6 +837,8 @@ L(shl_10_end):
352
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
358
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
359
@@ -837,7 +860,7 @@ L(shl_11_loop):
360
movdqa %xmm2, -32(%edx, %edi)
361
movdqa %xmm3, -16(%edx, %edi)
366
movdqa 16(%eax, %edi), %xmm2
368
@@ -859,6 +882,8 @@ L(shl_11_end):
370
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
376
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
377
@@ -880,7 +905,7 @@ L(shl_12_loop):
378
movdqa %xmm2, -32(%edx, %edi)
379
movdqa %xmm3, -16(%edx, %edi)
384
movdqa 16(%eax, %edi), %xmm2
386
@@ -902,6 +927,8 @@ L(shl_12_end):
388
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
394
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
395
@@ -923,7 +950,7 @@ L(shl_13_loop):
396
movdqa %xmm2, -32(%edx, %edi)
397
movdqa %xmm3, -16(%edx, %edi)
402
movdqa 16(%eax, %edi), %xmm2
404
@@ -945,6 +972,8 @@ L(shl_13_end):
406
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
412
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
413
@@ -966,7 +995,7 @@ L(shl_14_loop):
414
movdqa %xmm2, -32(%edx, %edi)
415
movdqa %xmm3, -16(%edx, %edi)
420
movdqa 16(%eax, %edi), %xmm2
422
@@ -988,7 +1017,8 @@ L(shl_14_end):
424
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
431
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
432
@@ -1010,7 +1040,7 @@ L(shl_15_loop):
433
movdqa %xmm2, -32(%edx, %edi)
434
movdqa %xmm3, -16(%edx, %edi)
439
movdqa 16(%eax, %edi), %xmm2
441
@@ -1229,8 +1259,10 @@ L(fwd_write_3bytes):
442
movl DEST(%esp), %eax
453
@@ -1281,7 +1313,7 @@ L(large_page_loop):
455
L(large_page_less_64bytes):
457
- jl L(large_page_less_32bytes)
458
+ jb L(large_page_less_32bytes)
460
movdqu 0x10(%eax), %xmm1
462
@@ -1617,11 +1649,11 @@ L(copy_backward):
466
- jge L(bk_write_more64bytes)
467
+ jae L(bk_write_more64bytes)
469
L(bk_write_64bytesless):
471
- jl L(bk_write_less32bytes)
472
+ jb L(bk_write_less32bytes)
474
L(bk_write_more32bytes):
475
/* Copy 32 bytes at a time. */
476
@@ -1653,10 +1685,11 @@ L(bk_write_less32bytes):
477
L(bk_write_less32bytes_2):
478
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
484
- jle L(bk_write_less32bytes)
485
+ jbe L(bk_write_less32bytes)
487
/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
488
then (EDX & 2) must be != 0. */
489
@@ -1712,7 +1745,7 @@ L(bk_ssse3_align):
493
- jl L(bk_write_more32bytes)
494
+ jb L(bk_write_more32bytes)
498
@@ -1727,7 +1760,7 @@ L(bk_ssse3_cpy):
502
- jge L(bk_ssse3_cpy)
503
+ jae L(bk_ssse3_cpy)
504
jmp L(bk_write_64bytesless)