19
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
24
#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM
26
#ifdef CRYPTOPP_X64_MASM_AVAILABLE
28
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y);
30
#elif CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
32
#ifdef CRYPTOPP_GENERATE_X64_MASM
33
Panama_SSE2_Pull PROC FRAME
21
39
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
23
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
40
void CRYPTOPP_NOINLINE Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
26
43
__asm__ __volatile__
28
45
".intel_syntax noprefix;"
31
AS2( mov WORD_REG(cx), count)
32
AS2( mov WORD_REG(si), state)
33
AS2( mov WORD_REG(di), z)
34
AS2( mov WORD_REG(dx), y)
36
AS2( shl WORD_REG(cx), 5)
48
AS2( mov AS_REG_1, count)
49
AS2( mov AS_REG_2, state)
53
#endif // #ifdef CRYPTOPP_GENERATE_X64_MASM
56
#define REG_loopEnd [esp]
57
#elif defined(CRYPTOPP_GENERATE_X64_MASM)
58
#define REG_loopEnd rdi
60
#define REG_loopEnd r8
38
AS2( mov ebx, [WORD_REG(si)+4*17])
39
AS2( add WORD_REG(cx), WORD_REG(bx))
44
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
45
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
46
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
47
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
48
AS2( mov eax, [WORD_REG(si)+4*16])
65
AS2( mov AS_REG_6d, [AS_REG_2+4*17])
66
AS2( add AS_REG_1, AS_REG_6)
69
AS2( mov REG_loopEnd, AS_REG_1)
75
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
76
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
77
AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
78
AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
79
AS2( mov eax, dword ptr [AS_REG_2+4*16])
52
83
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
53
AS2( test WORD_REG(bx), 1)
84
AS2( test AS_REG_6, 1)
56
87
AS2( movdqa xmm6, xmm2)
104
AS2( movd AS_REG_1d, xmm2)
106
AS2( movd AS_REG_7d, xmm3)
107
AS2( or AS_REG_1d, AS_REG_7d)
108
AS2( xor eax, AS_REG_1d)
79
110
#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
83
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
84
AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
113
AS2( movd AS_REG_1d, xmm7)\
114
AS2( rol AS_REG_1d, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
115
AS2( mov [AS_REG_2+SSE2_Index(ASM_MOD(5*(i), 17))*4], AS_REG_1d)
86
117
#define pi4(x, y, z, a, b, c, d) \
87
118
AS2( pcmpeqb xmm7, xmm7)\
110
141
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
113
AS2( test WORD_REG(di), WORD_REG(di))
144
AS2( test AS_REG_3, AS_REG_3)
115
146
AS2( movdqa xmm6, xmm4)
116
147
AS2( punpcklqdq xmm4, xmm2)
117
148
AS2( punpckhqdq xmm6, xmm2)
118
AS2( test WORD_REG(dx), 0xf)
149
AS2( test AS_REG_4, 15)
120
AS2( test WORD_REG(dx), WORD_REG(dx))
151
AS2( test AS_REG_4, AS_REG_4)
122
AS2( pxor xmm4, [WORD_REG(dx)])
123
AS2( pxor xmm6, [WORD_REG(dx)+16])
124
AS2( add WORD_REG(dx), 32)
153
AS2( pxor xmm4, [AS_REG_4])
154
AS2( pxor xmm6, [AS_REG_4+16])
155
AS2( add AS_REG_4, 32)
127
AS2( movdqu xmm0, [WORD_REG(dx)])
128
AS2( movdqu xmm2, [WORD_REG(dx)+16])
158
AS2( movdqu xmm0, [AS_REG_4])
159
AS2( movdqu xmm2, [AS_REG_4+16])
129
160
AS2( pxor xmm4, xmm0)
130
161
AS2( pxor xmm6, xmm2)
131
AS2( add WORD_REG(dx), 32)
162
AS2( add AS_REG_4, 32)
133
AS2( test WORD_REG(di), 0xf)
164
AS2( test AS_REG_3, 15)
135
AS2( movdqa [WORD_REG(di)], xmm4)
136
AS2( movdqa [WORD_REG(di)+16], xmm6)
137
AS2( add WORD_REG(di), 32)
166
AS2( movdqa XMMWORD_PTR [AS_REG_3], xmm4)
167
AS2( movdqa XMMWORD_PTR [AS_REG_3+16], xmm6)
168
AS2( add AS_REG_3, 32)
140
AS2( movdqu [WORD_REG(di)], xmm4)
141
AS2( movdqu [WORD_REG(di)+16], xmm6)
142
AS2( add WORD_REG(di), 32)
171
AS2( movdqu XMMWORD_PTR [AS_REG_3], xmm4)
172
AS2( movdqu XMMWORD_PTR [AS_REG_3+16], xmm6)
173
AS2( add AS_REG_3, 32)
146
AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32])
147
AS2( and WORD_REG(cx), 31*32)
148
AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
149
AS2( and WORD_REG(bp), 31*32)
177
AS2( lea AS_REG_1, [AS_REG_6 + 32])
178
AS2( and AS_REG_1, 31*32)
179
AS2( lea AS_REG_7, [AS_REG_6 + (32-24)*32])
180
AS2( and AS_REG_7, 31*32)
151
AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
182
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8])
152
183
AS2( pxor xmm3, xmm0)
153
184
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
154
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
155
AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
156
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
185
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*8], xmm3)
186
AS2( pxor xmm0, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8])
187
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+2*8], xmm0)
158
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
189
AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8])
159
190
AS2( pxor xmm1, xmm4)
160
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
161
AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
162
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
191
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+2*8], xmm1)
192
AS2( pxor xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8])
193
AS2( movdqa XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*8], xmm4)
165
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
166
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
167
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
168
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
196
AS2( movdqa xmm3, XMMWORD_PTR [AS_REG_2+3*16])
197
AS2( movdqa xmm2, XMMWORD_PTR [AS_REG_2+2*16])
198
AS2( movdqa xmm1, XMMWORD_PTR [AS_REG_2+1*16])
199
AS2( movdqa xmm0, XMMWORD_PTR [AS_REG_2+0*16])
170
201
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
171
AS2( test WORD_REG(bx), 1)
202
AS2( test AS_REG_6, 1)
174
205
AS2( movd xmm6, eax)
214
245
AS2( pxor xmm0, xmm4)
217
AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
218
AS2( and WORD_REG(cx), 31*32)
219
AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32])
220
AS2( and WORD_REG(bp), 31*32)
248
AS2( lea AS_REG_1, [AS_REG_6 + (32-4)*32])
249
AS2( and AS_REG_1, 31*32)
250
AS2( lea AS_REG_7, [AS_REG_6 + 16*32])
251
AS2( and AS_REG_7, 31*32)
222
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
223
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
253
AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+0*16])
254
AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+0*16])
224
255
AS2( movdqa xmm6, xmm4)
225
256
AS2( punpcklqdq xmm4, xmm5)
226
257
AS2( punpckhqdq xmm6, xmm5)
227
258
AS2( pxor xmm3, xmm4)
228
259
AS2( pxor xmm2, xmm6)
230
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
231
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
261
AS2( movdqa xmm4, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_1+1*16])
262
AS2( movdqa xmm5, XMMWORD_PTR [AS_REG_2+20*4+AS_REG_7+1*16])
232
263
AS2( movdqa xmm6, xmm4)
233
264
AS2( punpcklqdq xmm4, xmm5)
234
265
AS2( punpckhqdq xmm6, xmm5)
236
267
AS2( pxor xmm0, xmm6)
239
AS2( add WORD_REG(bx), 32)
240
AS2( cmp WORD_REG(bx), [WORD_REG(sp)])
270
AS2( add AS_REG_6, 32)
271
AS2( cmp AS_REG_6, REG_loopEnd)
244
AS2( add WORD_REG(sp), WORD_SZ)
246
AS2( mov [WORD_REG(si)+4*16], eax)
247
AS2( movdqa [WORD_REG(si)+3*16], xmm3)
248
AS2( movdqa [WORD_REG(si)+2*16], xmm2)
249
AS2( movdqa [WORD_REG(si)+1*16], xmm1)
250
AS2( movdqa [WORD_REG(si)+0*16], xmm0)
275
AS2( mov [AS_REG_2+4*16], eax)
276
AS2( movdqa XMMWORD_PTR [AS_REG_2+3*16], xmm3)
277
AS2( movdqa XMMWORD_PTR [AS_REG_2+2*16], xmm2)
278
AS2( movdqa XMMWORD_PTR [AS_REG_2+1*16], xmm1)
279
AS2( movdqa XMMWORD_PTR [AS_REG_2+0*16], xmm0)
281
#if CRYPTOPP_BOOL_X86
255
".att_syntax prefix;"
257
: "c" (count), "S" (state), "D" (z), "d" (y)
258
: "%eax", "memory", "cc"
289
".att_syntax prefix;"
291
#if CRYPTOPP_BOOL_X64
292
: "D" (count), "S" (state), "d" (z), "c" (y)
293
: "%r8", "%r9", "r10", "%eax", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
295
: "c" (count), "d" (state), "S" (z), "D" (y)
296
: "%eax", "memory", "cc"
300
#ifdef CRYPTOPP_GENERATE_X64_MASM
301
movdqa xmm6, [rsp + 0h]
302
movdqa xmm7, [rsp + 10h]
305
Panama_SSE2_Pull ENDP
309
#endif // #ifdef CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
311
#ifndef CRYPTOPP_GENERATE_X64_MASM
265
313
template <class B>
266
314
void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)