NAMESPACE_BEGIN(CryptoPP)
template <class B>
void Panama<B>::Reset()
memset(m_state, 0, m_state.SizeInBytes());
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
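// bit 0 of m_state[17] caches HasSSSE3() so Panama_SSE2_Pull can branch between its SSSE3
// and plain SSE2 paths; the same word also carries the rotating buffer's start offset
// (a multiple of 32 bytes), which a reset puts back to zero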
m_state[17] = HasSSSE3();
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
void Panama_SSE2_Pull(size_t count, word32 *state, word32 *z, const word32 *y)
".intel_syntax noprefix;"
AS2( mov WORD_REG(cx), count)
AS2( mov WORD_REG(si), state)
AS2( mov WORD_REG(di), z)
AS2( mov WORD_REG(dx), y)
AS2( shl WORD_REG(cx), 5)
AS2( mov ebx, [WORD_REG(si)+4*17])
AS2( add WORD_REG(cx), WORD_REG(bx))
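// WORD_REG(bx) holds the buffer start offset loaded from m_state[17] (bit 0 = SSSE3 flag);
// WORD_REG(cx) becomes bstart + count*32, the value the loop runs up to (it is saved on the
// stack and compared against at the bottom of the loop)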
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( mov eax, [WORD_REG(si)+4*16])
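// the 17-word state a is now in registers, in the permuted order documented with the a(i)
// macro near Iterate(): xmm0 = a(4,8,12,16), xmm1 = a(3,7,11,15), xmm2 = a(2,6,10,14),
// xmm3 = a(1,5,9,13), eax = a(0)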
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test WORD_REG(bx), 1)
AS2( movdqa xmm6, xmm2)
AS2( movss xmm6, xmm3)
ASS( pshufd xmm5, xmm6, 0, 3, 2, 1)
AS2( movdqa xmm7, xmm3)
AS2( movss xmm7, xmm6)
ASS( pshufd xmm6, xmm7, 0, 3, 2, 1)
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( movdqa xmm5, xmm3)
AS3( palignr xmm5, xmm2, 4)
AS3( palignr xmm6, xmm3, 4)
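// xmm5 and xmm6 now hold word-rotated combinations of the state (palignr on the SSSE3 path,
// movss/pshufd on plain SSE2); they feed the last two pi4 groups below as the
// neighbouring-word operands required by gamma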
#define SSE2_Index(i) ASM_MOD(((i)*13+16), 17)
AS2( rol ecx, ASM_MOD((ASM_MOD(5*i,17)*(ASM_MOD(5*i,17)+1)/2), 32))\
AS2( mov [WORD_REG(si)+SSE2_Index(ASM_MOD(5*(i), 17))*4], ecx)
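// scalar tail of the combined gamma/pi step: rotate the gamma result left by the triangular
// amount j*(j+1)/2 mod 32 and store it at the permuted position j = 5*i mod 17, mirroring
// the GP(i) macro used by the portable Iterate() further down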
#define pi4(x, y, z, a, b, c, d) \
AS2( pcmpeqb xmm7, xmm7)\
ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\
AS2( punpckhqdq xmm7, xmm7)\
ASS( pshuflw xmm7, xmm7, 1, 0, 3, 2)\
pi4(xmm1, xmm2, xmm3, 1, 5, 9, 13)
pi4(xmm0, xmm1, xmm2, 2, 6, 10, 14)
pi4(xmm6, xmm0, xmm1, 3, 7, 11, 15)
pi4(xmm5, xmm6, xmm0, 4, 8, 12, 16)
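// each pi4 invocation applies gamma and pi to four state words, covering sixteen of the
// seventeen words; the remaining word appears to be handled by the scalar eax/ecx sequence above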
// output keystream and update buffer here to hide partial memory stalls between pi and theta
AS2( movdqa xmm4, xmm3)
AS2( punpcklqdq xmm3, xmm2) // 1 5 2 6
AS2( punpckhdq xmm4, xmm2) // 9 10 13 14
AS2( movdqa xmm2, xmm1)
AS2( punpcklqdq xmm1, xmm0) // 3 7 4 8
AS2( punpckhdq xmm2, xmm0) // 11 12 15 16
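// regroup the permuted state into natural index order (see the lane comments above) so the
// eight keystream words a(9)..a(16) end up contiguous; output is skipped when z is NULL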
AS2( test WORD_REG(di), WORD_REG(di))
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm2)
AS2( punpckhqdq xmm6, xmm2)
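// xmm4 = a(9..12), xmm6 = a(13..16); XOR in y (if y is non-NULL), using aligned loads when
// y is 16-byte aligned and movdqu otherwise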
AS2( test WORD_REG(dx), 0xf)
AS2( test WORD_REG(dx), WORD_REG(dx))
AS2( pxor xmm4, [WORD_REG(dx)])
AS2( pxor xmm6, [WORD_REG(dx)+16])
AS2( add WORD_REG(dx), 32)
AS2( movdqu xmm0, [WORD_REG(dx)])
AS2( movdqu xmm2, [WORD_REG(dx)+16])
AS2( pxor xmm4, xmm0)
AS2( pxor xmm6, xmm2)
AS2( add WORD_REG(dx), 32)
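// store the two keystream blocks to z with movdqa when z is 16-byte aligned, movdqu otherwise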
AS2( test WORD_REG(di), 0xf)
AS2( movdqa [WORD_REG(di)], xmm4)
AS2( movdqa [WORD_REG(di)+16], xmm6)
AS2( add WORD_REG(di), 32)
AS2( movdqu [WORD_REG(di)], xmm4)
AS2( movdqu [WORD_REG(di)+16], xmm6)
AS2( add WORD_REG(di), 32)
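// buffer (lambda) update, the counterpart of the scalar UL macro below: WORD_REG(cx) and
// WORD_REG(bp) pick two stages of the 32-stage ring buffer relative to the moving start
// offset in WORD_REG(bx); each stage is 8 words = 32 bytes, and the AND with 31*32 wraps
// the byte offset around the ring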
AS2( lea WORD_REG(cx), [WORD_REG(bx) + 32])
AS2( and WORD_REG(cx), 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + (32-24)*32])
AS2( and WORD_REG(bp), 31*32)
AS2( movdqa xmm0, [WORD_REG(si)+20*4+WORD_REG(cx)+0*8])
AS2( pxor xmm3, xmm0)
ASS( pshufd xmm0, xmm0, 2, 3, 0, 1)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+0*8], xmm3)
AS2( pxor xmm0, [WORD_REG(si)+20*4+WORD_REG(bp)+2*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+2*8], xmm0)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+2*8])
AS2( pxor xmm1, xmm4)
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(cx)+2*8], xmm1)
AS2( pxor xmm4, [WORD_REG(si)+20*4+WORD_REG(bp)+0*8])
AS2( movdqa [WORD_REG(si)+20*4+WORD_REG(bp)+0*8], xmm4)
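// theta: reload the gamma/pi results c that were just written back into the state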
AS2( movdqa xmm3, [WORD_REG(si)+3*16])
AS2( movdqa xmm2, [WORD_REG(si)+2*16])
AS2( movdqa xmm1, [WORD_REG(si)+1*16])
AS2( movdqa xmm0, [WORD_REG(si)+0*16])
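// build word-rotated copies of c in xmm4..xmm7 (palignr when SSSE3 is available,
// movss/pshufd otherwise); these supply the shifted c words needed by the theta XORs below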
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS2( test WORD_REG(bx), 1)
AS2( movdqa xmm7, xmm3)
AS2( movss xmm7, xmm6)
AS2( movdqa xmm6, xmm2)
AS2( movss xmm6, xmm3)
AS2( movdqa xmm5, xmm1)
AS2( movss xmm5, xmm2)
AS2( movdqa xmm4, xmm0)
AS2( movss xmm4, xmm1)
ASS( pshufd xmm7, xmm7, 0, 3, 2, 1)
ASS( pshufd xmm6, xmm6, 0, 3, 2, 1)
ASS( pshufd xmm5, xmm5, 0, 3, 2, 1)
ASS( pshufd xmm4, xmm4, 0, 3, 2, 1)
#if CRYPTOPP_BOOL_SSSE3_ASM_AVAILABLE
AS3( palignr xmm7, xmm3, 4)
AS2( movq xmm6, xmm3)
AS3( palignr xmm6, xmm2, 4)
AS2( movq xmm5, xmm2)
AS3( palignr xmm5, xmm1, 4)
AS2( movq xmm4, xmm1)
AS3( palignr xmm4, xmm0, 4)
AS2( pxor xmm3, xmm2)
AS2( pxor xmm2, xmm1)
AS2( pxor xmm1, xmm0)
AS2( pxor xmm0, xmm7)
AS2( pxor xmm3, xmm7)
AS2( pxor xmm2, xmm6)
AS2( pxor xmm1, xmm5)
AS2( pxor xmm0, xmm4)
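// the XORs above form a(i) = c(i) ^ c(i+1) ^ c(i+4): words four apart share a lane in
// adjacent registers, so the c(i+1) term is the neighbouring register (xmm7 for the
// wrap-around group) and the rotated copies xmm4..xmm7 supply the c(i+4) term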
AS2( lea WORD_REG(cx), [WORD_REG(bx) + (32-4)*32])
AS2( and WORD_REG(cx), 31*32)
AS2( lea WORD_REG(bp), [WORD_REG(bx) + 16*32])
AS2( and WORD_REG(bp), 31*32)
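// fold two further buffer stages into the state; these play the role of the b(4,i) and
// b(16,i) terms in the scalar TS1L/TS2 macros below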
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+0*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+0*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm3, xmm4)
AS2( pxor xmm2, xmm6)
AS2( movdqa xmm4, [WORD_REG(si)+20*4+WORD_REG(cx)+1*16])
AS2( movdqa xmm5, [WORD_REG(si)+20*4+WORD_REG(bp)+1*16])
AS2( movdqa xmm6, xmm4)
AS2( punpcklqdq xmm4, xmm5)
AS2( punpckhqdq xmm6, xmm5)
AS2( pxor xmm1, xmm4)
AS2( pxor xmm0, xmm6)
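// advance the buffer start offset by one stage (32 bytes) and loop until it reaches the
// end value saved on the stack at entry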
AS2( add WORD_REG(bx), 32)
AS2( cmp WORD_REG(bx), [WORD_REG(sp)])
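// epilogue: discard the saved loop bound and write eax and xmm0..xmm3 back into the state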
AS2( add WORD_REG(sp), WORD_SZ)
AS2( mov [WORD_REG(si)+4*16], eax)
AS2( movdqa [WORD_REG(si)+3*16], xmm3)
AS2( movdqa [WORD_REG(si)+2*16], xmm2)
AS2( movdqa [WORD_REG(si)+1*16], xmm1)
AS2( movdqa [WORD_REG(si)+0*16], xmm0)
".att_syntax prefix;"
: "c" (count), "S" (state), "D" (z), "d" (y)
: "%eax", "memory", "cc"
template <class B>
void Panama<B>::Iterate(size_t count, const word32 *p, word32 *z, const word32 *y)
word32 bstart = m_state[17];
word32 *const aPtr = m_state;
#define bPtr ((byte *)(aPtr+20))
// reorder the state for SSE2
// a and c: 4 8 12 16 | 3 7 11 15 | 2 6 10 14 | 1 5 9 13 | 0
// xmm0 xmm1 xmm2 xmm3 eax
#define a(i) aPtr[((i)*13+16) % 17] // 13 is inverse of 4 mod 17
#define c(i) cPtr[((i)*13+16) % 17]
// b: 0 4 | 1 5 | 2 6 | 3 7
#define b(i, j) b##i[(j)*2%8 + (j)/4]
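// for example, a(1) expands to aPtr[(1*13+16) % 17] == aPtr[12] and a(4) to aPtr[0]
// (4*13 = 52 = 3*17 + 1, so multiplying by 13 undoes the multiply-by-4 permutation);
// b(0,4) expands to b0[4*2%8 + 4/4] == b0[1], matching the 0 4 | 1 5 | 2 6 | 3 7 pairing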
#define OA(i) z[i] = ConditionalByteReverse(B::ToEnum(), a(i+9))
#define OX(i) z[i] = y[i] ^ ConditionalByteReverse(B::ToEnum(), a(i+9))
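// OA/OX emit one keystream word from a(9..16), OX additionally XORing in the caller's input y;
// the US/UL macros below are the buffer (lambda) feedback, pushing either the serial input p
// (US) or state words a(1..8) (UL) into stage 0 while folding the displaced words into stage 25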
#define US(i) {word32 t=b(0,i); b(0,i)=ConditionalByteReverse(B::ToEnum(), p[i])^t; b(25,(i+6)%8)^=t;}
#define UL(i) {word32 t=b(0,i); b(0,i)=a(i+1)^t; b(25,(i+6)%8)^=t;}
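// gamma and pi combined: gamma computes the nonlinear a(i) ^ (a(i+1) | ~a(i+2)); pi moves the
// result for word i to position j = 5*i mod 17, rotated left by j*(j+1)/2 mod 32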
#define GP(i) c(5*i%17) = rotlFixed(a(i) ^ (a((i+1)%17) | ~a((i+2)%17)), ((5*i%17)*((5*i%17)+1)/2)%32)
// theta and sigma
#define T(i,x) a(i) = c(i) ^ c((i+1)%17) ^ c((i+4)%17) ^ x
#define TS1S(i) T(i+1, ConditionalByteReverse(B::ToEnum(), p[i]))
#define TS1L(i) T(i+1, b(4,i))
#define TS2(i) T(i+9, b(16,i))