119
119
for (i = 1; i < m_rounds; i++) {
122
Td[0*256+Se[GETBYTE(rk[0], 3)]] ^
123
Td[1*256+Se[GETBYTE(rk[0], 2)]] ^
124
Td[2*256+Se[GETBYTE(rk[0], 1)]] ^
125
Td[3*256+Se[GETBYTE(rk[0], 0)]];
122
Td0[Se[GETBYTE(rk[0], 3)]] ^
123
Td1[Se[GETBYTE(rk[0], 2)]] ^
124
Td2[Se[GETBYTE(rk[0], 1)]] ^
125
Td3[Se[GETBYTE(rk[0], 0)]];
127
Td[0*256+Se[GETBYTE(rk[1], 3)]] ^
128
Td[1*256+Se[GETBYTE(rk[1], 2)]] ^
129
Td[2*256+Se[GETBYTE(rk[1], 1)]] ^
130
Td[3*256+Se[GETBYTE(rk[1], 0)]];
127
Td0[Se[GETBYTE(rk[1], 3)]] ^
128
Td1[Se[GETBYTE(rk[1], 2)]] ^
129
Td2[Se[GETBYTE(rk[1], 1)]] ^
130
Td3[Se[GETBYTE(rk[1], 0)]];
132
Td[0*256+Se[GETBYTE(rk[2], 3)]] ^
133
Td[1*256+Se[GETBYTE(rk[2], 2)]] ^
134
Td[2*256+Se[GETBYTE(rk[2], 1)]] ^
135
Td[3*256+Se[GETBYTE(rk[2], 0)]];
132
Td0[Se[GETBYTE(rk[2], 3)]] ^
133
Td1[Se[GETBYTE(rk[2], 2)]] ^
134
Td2[Se[GETBYTE(rk[2], 1)]] ^
135
Td3[Se[GETBYTE(rk[2], 0)]];
137
Td[0*256+Se[GETBYTE(rk[3], 3)]] ^
138
Td[1*256+Se[GETBYTE(rk[3], 2)]] ^
139
Td[2*256+Se[GETBYTE(rk[3], 1)]] ^
140
Td[3*256+Se[GETBYTE(rk[3], 0)]];
137
Td0[Se[GETBYTE(rk[3], 3)]] ^
138
Td1[Se[GETBYTE(rk[3], 2)]] ^
139
Td2[Se[GETBYTE(rk[3], 1)]] ^
140
Td3[Se[GETBYTE(rk[3], 0)]];
145
145
ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
148
#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
148
const static unsigned int s_lineSizeDiv4 = CRYPTOPP_L1_CACHE_LINE_SIZE/4;
150
const static unsigned int s_i3=3, s_i2=2, s_i1=1, s_i0=0;
152
const static unsigned int s_i3=0, s_i2=1, s_i1=2, s_i0=3;
150
155
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
152
#if defined(CRYPTOPP_X86_ASM_AVAILABLE)
155
const word32 *k = m_key;
156
const word32 *kLoopEnd = k + m_rounds*4;
157
#if CRYPTOPP_BOOL_X64
162
#define RESTORE_K_END
163
#define SAVE_0(x) AS2(mov r10d, x)
164
#define SAVE_1(x) AS2(mov r11d, x)
165
#define SAVE_2(x) AS2(mov r12d, x)
166
#define RESTORE_0(x) AS2(mov x, r10d)
167
#define RESTORE_1(x) AS2(mov x, r11d)
168
#define RESTORE_2(x) AS2(mov x, r12d)
171
#define K_END_REG edi
172
#define SAVE_K AS2(movd mm4, esi)
173
#define RESTORE_K AS2(movd esi, mm4)
174
#define RESTORE_K_END AS2(movd edi, mm5)
175
#define SAVE_0(x) AS2(movd mm0, x)
176
#define SAVE_1(x) AS2(movd mm1, x)
177
#define SAVE_2(x) AS2(movd mm2, x)
178
#define RESTORE_0(x) AS2(movd x, mm0)
179
#define RESTORE_1(x) AS2(movd x, mm1)
180
#define RESTORE_2(x) AS2(movd x, mm2)
183
word32 t0, t1, t2, t3;
186
".intel_syntax noprefix;"
189
AS2( mov WORD_REG(bp), WORD_REG(ax))
190
#if CRYPTOPP_BOOL_X64
191
// save these manually. clobber list doesn't seem to work as of GCC 4.1.0
193
AS1( pushq K_END_REG)
198
AS2( mov K_END_REG, rcx)
204
const word32 *t = Te;
207
AS2( mov edx, g_cacheLineSize)
208
AS2( mov WORD_REG(di), inBlock)
210
AS2( movd mm5, kLoopEnd)
220
AS2( mov eax, [K_REG+0*4]) // s0
221
AS2( xor eax, [WORD_REG(di)+0*4])
223
AS2( mov ebx, [K_REG+1*4])
224
AS2( xor ebx, [WORD_REG(di)+1*4])
227
AS2( mov eax, [K_REG+2*4])
228
AS2( xor eax, [WORD_REG(di)+2*4])
231
AS2( mov ecx, [K_REG+3*4])
232
AS2( xor ecx, [WORD_REG(di)+3*4])
235
// read Te0 into L1 cache. this code could be simplified by using lfence, but that is an SSE2 instruction
237
AS2( mov edi, ebx) // make index depend on previous loads to simulate lfence
239
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
241
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
243
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
245
AS2( and ebx, [WORD_REG(bp)+WORD_REG(di)])
249
AS2( and ebx, [WORD_REG(bp)+1020])
250
#if CRYPTOPP_BOOL_X64
262
AS2( mov edi, [K_REG+4*4]) // t0
263
AS2( mov eax, [K_REG+5*4])
264
AS2( mov ebx, [K_REG+6*4])
265
AS2( mov edx, [K_REG+7*4])
269
#define QUARTER_ROUND(t, a, b, c, d) \
270
AS2(movzx esi, t##l)\
271
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])\
272
AS2(movzx esi, t##h)\
273
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
274
AS2(shr e##t##x, 16)\
275
AS2(movzx esi, t##l)\
276
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
277
AS2(movzx esi, t##h)\
278
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])
289
QUARTER_ROUND(c, t0, t1, t2, t3)
291
QUARTER_ROUND(c, t3, t0, t1, t2)
293
QUARTER_ROUND(c, t2, t3, t0, t1)
295
QUARTER_ROUND(c, t1, t2, t3, t0)
304
AS2( mov edi, [K_REG+0*4])
305
AS2( mov eax, [K_REG+1*4])
306
AS2( mov ebx, [K_REG+2*4])
307
AS2( mov ecx, [K_REG+3*4])
309
#define QUARTER_ROUND(t, a, b, c, d) \
310
AS2(movzx esi, t##l)\
311
AS2(a, [WORD_REG(bp)+3*1024+4*WORD_REG(si)])\
312
AS2(movzx esi, t##h)\
313
AS2(b, [WORD_REG(bp)+2*1024+4*WORD_REG(si)])\
314
AS2(shr e##t##x, 16)\
315
AS2(movzx esi, t##l)\
316
AS2(c, [WORD_REG(bp)+1*1024+4*WORD_REG(si)])\
317
AS2(movzx esi, t##h)\
318
AS2(d, [WORD_REG(bp)+0*1024+4*WORD_REG(si)])
320
QUARTER_ROUND(d, s0, s1, s2, s3)
322
QUARTER_ROUND(d, s3, s0, s1, s2)
324
QUARTER_ROUND(d, s2, s3, s0, s1)
326
QUARTER_ROUND(d, s1, s2, s3, s0)
332
AS2( mov edi, [K_REG+4*4])
333
AS2( mov eax, [K_REG+5*4])
334
AS2( mov ebx, [K_REG+6*4])
335
AS2( mov edx, [K_REG+7*4])
337
QUARTER_ROUND(c, t0, t1, t2, t3)
339
QUARTER_ROUND(c, t3, t0, t1, t2)
341
QUARTER_ROUND(c, t2, t3, t0, t1)
343
QUARTER_ROUND(c, t1, t2, t3, t0)
352
AS2( cmp K_END_REG, K_REG)
365
AS2( mov eax, [K_END_REG+0*4])
366
AS2( mov ecx, [K_END_REG+1*4])
367
AS2( mov esi, [K_END_REG+2*4])
368
AS2( mov edi, [K_END_REG+3*4])
370
#define QUARTER_ROUND(a, b, c, d) \
372
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
376
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
382
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(bx)])\
385
AS2( movzx ebx, BYTE PTR [WORD_REG(bp)+1+4*WORD_REG(dx)])\
388
QUARTER_ROUND(eax, ecx, esi, edi)
390
QUARTER_ROUND(edi, eax, ecx, esi)
392
QUARTER_ROUND(esi, edi, eax, ecx)
394
QUARTER_ROUND(ecx, esi, edi, eax)
398
#if CRYPTOPP_BOOL_X64
409
#if defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER < 1300)
413
".att_syntax prefix;"
414
: "=a" (t0), "=c" (t1), "=S" (t2), "=D" (t3)
415
: "a" (Te), "D" (inBlock), "S" (k), "c" (kLoopEnd), "d" (g_cacheLineSize)
421
t0 ^= ((const word32 *)xorBlock)[0];
422
t1 ^= ((const word32 *)xorBlock)[1];
423
t2 ^= ((const word32 *)xorBlock)[2];
424
t3 ^= ((const word32 *)xorBlock)[3];
426
((word32 *)outBlock)[0] = t0;
427
((word32 *)outBlock)[1] = t1;
428
((word32 *)outBlock)[2] = t2;
429
((word32 *)outBlock)[3] = t3;
431
AS2( mov WORD_REG(bx), xorBlock)
432
AS2( test WORD_REG(bx), WORD_REG(bx))
434
AS2( xor eax, [WORD_REG(bx)+0*4])
435
AS2( xor ecx, [WORD_REG(bx)+1*4])
436
AS2( xor esi, [WORD_REG(bx)+2*4])
437
AS2( xor edi, [WORD_REG(bx)+3*4])
439
AS2( mov WORD_REG(bx), outBlock)
440
AS2( mov [WORD_REG(bx)+0*4], eax)
441
AS2( mov [WORD_REG(bx)+1*4], ecx)
442
AS2( mov [WORD_REG(bx)+2*4], esi)
443
AS2( mov [WORD_REG(bx)+3*4], edi)
447
#endif // #ifdef CRYPTOPP_X86_ASM_AVAILABLE
449
157
word32 s0, s1, s2, s3, t0, t1, t2, t3;
450
158
const word32 *rk = m_key;
462
170
// timing attack countermeasure. see comments at top for more details
463
const int cacheLineSize = GetCacheLineSize();
466
for (i=0; i<1024; i+=cacheLineSize)
467
u &= *(const word32 *)(((const byte *)Te)+i);
173
for (i=0; i<sizeof(Te0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
174
u &= (Te0[i+0*s_lineSizeDiv4] & Te0[i+2*s_lineSizeDiv4]) & (Te0[i+1*s_lineSizeDiv4] & Te0[i+3*s_lineSizeDiv4]);
469
175
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
473
#define QUARTER_ROUND(t, a, b, c, d) \
474
a ^= rotrFixed(Te[byte(t)], 24); t >>= 8;\
475
b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\
476
c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\
479
#define QUARTER_ROUND(t, a, b, c, d) \
480
d ^= Te[byte(t)]; t >>= 8;\
481
c ^= rotrFixed(Te[byte(t)], 8); t >>= 8;\
482
b ^= rotrFixed(Te[byte(t)], 16); t >>= 8;\
483
a ^= rotrFixed(Te[t], 24);
486
QUARTER_ROUND(s3, t0, t1, t2, t3)
487
QUARTER_ROUND(s2, t3, t0, t1, t2)
488
QUARTER_ROUND(s1, t2, t3, t0, t1)
489
QUARTER_ROUND(s0, t1, t2, t3, t0)
179
Te0[GETBYTE(s0, s_i3)] ^
180
rotrFixed(Te0[GETBYTE(s1, s_i2)], 8) ^
181
rotrFixed(Te0[GETBYTE(s2, s_i1)], 16) ^
182
rotrFixed(Te0[GETBYTE(s3, s_i0)], 24);
184
Te0[GETBYTE(s1, s_i3)] ^
185
rotrFixed(Te0[GETBYTE(s2, s_i2)], 8) ^
186
rotrFixed(Te0[GETBYTE(s3, s_i1)], 16) ^
187
rotrFixed(Te0[GETBYTE(s0, s_i0)], 24);
189
Te0[GETBYTE(s2, s_i3)] ^
190
rotrFixed(Te0[GETBYTE(s3, s_i2)], 8) ^
191
rotrFixed(Te0[GETBYTE(s0, s_i1)], 16) ^
192
rotrFixed(Te0[GETBYTE(s1, s_i0)], 24);
194
Te0[GETBYTE(s3, s_i3)] ^
195
rotrFixed(Te0[GETBYTE(s0, s_i2)], 8) ^
196
rotrFixed(Te0[GETBYTE(s1, s_i1)], 16) ^
197
rotrFixed(Te0[GETBYTE(s2, s_i0)], 24);
492
199
// Nr - 2 full rounds:
493
200
unsigned int r = m_rounds/2 - 1;
496
#define QUARTER_ROUND(t, a, b, c, d) \
497
a ^= Te[3*256+byte(t)]; t >>= 8;\
498
b ^= Te[2*256+byte(t)]; t >>= 8;\
499
c ^= Te[1*256+byte(t)]; t >>= 8;\
502
s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
504
QUARTER_ROUND(t3, s0, s1, s2, s3)
505
QUARTER_ROUND(t2, s3, s0, s1, s2)
506
QUARTER_ROUND(t1, s2, s3, s0, s1)
507
QUARTER_ROUND(t0, s1, s2, s3, s0)
509
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
511
QUARTER_ROUND(s3, t0, t1, t2, t3)
512
QUARTER_ROUND(s2, t3, t0, t1, t2)
513
QUARTER_ROUND(s1, t2, t3, t0, t1)
514
QUARTER_ROUND(s0, t1, t2, t3, t0)
204
Te0[GETBYTE(t0, 3)] ^
205
Te1[GETBYTE(t1, 2)] ^
206
Te2[GETBYTE(t2, 1)] ^
207
Te3[GETBYTE(t3, 0)] ^
210
Te0[GETBYTE(t1, 3)] ^
211
Te1[GETBYTE(t2, 2)] ^
212
Te2[GETBYTE(t3, 1)] ^
213
Te3[GETBYTE(t0, 0)] ^
216
Te0[GETBYTE(t2, 3)] ^
217
Te1[GETBYTE(t3, 2)] ^
218
Te2[GETBYTE(t0, 1)] ^
219
Te3[GETBYTE(t1, 0)] ^
222
Te0[GETBYTE(t3, 3)] ^
223
Te1[GETBYTE(t0, 2)] ^
224
Te2[GETBYTE(t1, 1)] ^
225
Te3[GETBYTE(t2, 0)] ^
229
Te0[GETBYTE(s0, 3)] ^
230
Te1[GETBYTE(s1, 2)] ^
231
Te2[GETBYTE(s2, 1)] ^
232
Te3[GETBYTE(s3, 0)] ^
235
Te0[GETBYTE(s1, 3)] ^
236
Te1[GETBYTE(s2, 2)] ^
237
Te2[GETBYTE(s3, 1)] ^
238
Te3[GETBYTE(s0, 0)] ^
241
Te0[GETBYTE(s2, 3)] ^
242
Te1[GETBYTE(s3, 2)] ^
243
Te2[GETBYTE(s0, 1)] ^
244
Te3[GETBYTE(s1, 0)] ^
247
Te0[GETBYTE(s3, 3)] ^
248
Te1[GETBYTE(s0, 2)] ^
249
Te2[GETBYTE(s1, 1)] ^
250
Te3[GETBYTE(s2, 0)] ^
520
256
// timing attack countermeasure. see comments at top for more details
522
for (i=0; i<256; i+=cacheLineSize)
523
u &= *(const word32 *)(Se+i);
524
u &= *(const word32 *)(Se+252);
258
for (i=0; i<sizeof(Se)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
259
u &= (((word32*)Se)[i+0*s_lineSizeDiv4] & ((word32*)Se)[i+2*s_lineSizeDiv4]) & (((word32*)Se)[i+1*s_lineSizeDiv4] & ((word32*)Se)[i+3*s_lineSizeDiv4]);
525
260
t0 |= u; t1 |= u; t2 |= u; t3 |= u;
529
264
word32 *const obw = (word32 *)outBlock;
530
265
const word32 *const xbw = (const word32 *)xorBlock;
532
#define QUARTER_ROUND(t, a, b, c, d) \
533
tempBlock[a] = Se[byte(t)]; t >>= 8;\
534
tempBlock[b] = Se[byte(t)]; t >>= 8;\
535
tempBlock[c] = Se[byte(t)]; t >>= 8;\
536
tempBlock[d] = Se[t];
538
QUARTER_ROUND(t2, 15, 2, 5, 8)
539
QUARTER_ROUND(t1, 11, 14, 1, 4)
540
QUARTER_ROUND(t0, 7, 10, 13, 0)
541
QUARTER_ROUND(t3, 3, 6, 9, 12)
268
tempBlock[0] = Se[GETBYTE(t0, 3)];
269
tempBlock[1] = Se[GETBYTE(t1, 2)];
270
tempBlock[2] = Se[GETBYTE(t2, 1)];
271
tempBlock[3] = Se[GETBYTE(t3, 0)];
272
tempBlock[4] = Se[GETBYTE(t1, 3)];
273
tempBlock[5] = Se[GETBYTE(t2, 2)];
274
tempBlock[6] = Se[GETBYTE(t3, 1)];
275
tempBlock[7] = Se[GETBYTE(t0, 0)];
276
tempBlock[8] = Se[GETBYTE(t2, 3)];
277
tempBlock[9] = Se[GETBYTE(t3, 2)];
278
tempBlock[10] = Se[GETBYTE(t0, 1)];
279
tempBlock[11] = Se[GETBYTE(t1, 0)];
280
tempBlock[12] = Se[GETBYTE(t3, 3)];
281
tempBlock[13] = Se[GETBYTE(t0, 2)];
282
tempBlock[14] = Se[GETBYTE(t1, 1)];
283
tempBlock[15] = Se[GETBYTE(t2, 0)];
576
316
// timing attack countermeasure. see comments at top for more details
577
const int cacheLineSize = GetCacheLineSize();
580
for (i=0; i<1024; i+=cacheLineSize)
581
u &= *(const word32 *)(((const byte *)Td)+i);
319
for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
320
u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]);
583
321
s0 |= u; s1 |= u; s2 |= u; s3 |= u;
587
#define QUARTER_ROUND(t, a, b, c, d) \
588
a ^= rotrFixed(Td[byte(t)], 24); t >>= 8;\
589
b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\
590
c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\
593
#define QUARTER_ROUND(t, a, b, c, d) \
594
d ^= Td[byte(t)]; t >>= 8;\
595
c ^= rotrFixed(Td[byte(t)], 8); t >>= 8;\
596
b ^= rotrFixed(Td[byte(t)], 16); t >>= 8;\
597
a ^= rotrFixed(Td[t], 24);
600
QUARTER_ROUND(s3, t2, t1, t0, t3)
601
QUARTER_ROUND(s2, t1, t0, t3, t2)
602
QUARTER_ROUND(s1, t0, t3, t2, t1)
603
QUARTER_ROUND(s0, t3, t2, t1, t0)
325
Td0[GETBYTE(s0, s_i3)] ^
326
rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^
327
rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^
328
rotrFixed(Td0[GETBYTE(s1, s_i0)], 24);
330
Td0[GETBYTE(s1, s_i3)] ^
331
rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^
332
rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^
333
rotrFixed(Td0[GETBYTE(s2, s_i0)], 24);
335
Td0[GETBYTE(s2, s_i3)] ^
336
rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^
337
rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^
338
rotrFixed(Td0[GETBYTE(s3, s_i0)], 24);
340
Td0[GETBYTE(s3, s_i3)] ^
341
rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^
342
rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^
343
rotrFixed(Td0[GETBYTE(s0, s_i0)], 24);
606
345
// Nr - 2 full rounds:
607
346
unsigned int r = m_rounds/2 - 1;
610
#define QUARTER_ROUND(t, a, b, c, d) \
611
a ^= Td[3*256+byte(t)]; t >>= 8;\
612
b ^= Td[2*256+byte(t)]; t >>= 8;\
613
c ^= Td[1*256+byte(t)]; t >>= 8;\
616
s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];
618
QUARTER_ROUND(t3, s2, s1, s0, s3)
619
QUARTER_ROUND(t2, s1, s0, s3, s2)
620
QUARTER_ROUND(t1, s0, s3, s2, s1)
621
QUARTER_ROUND(t0, s3, s2, s1, s0)
623
t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
625
QUARTER_ROUND(s3, t2, t1, t0, t3)
626
QUARTER_ROUND(s2, t1, t0, t3, t2)
627
QUARTER_ROUND(s1, t0, t3, t2, t1)
628
QUARTER_ROUND(s0, t3, t2, t1, t0)
350
Td0[GETBYTE(t0, 3)] ^
351
Td1[GETBYTE(t3, 2)] ^
352
Td2[GETBYTE(t2, 1)] ^
353
Td3[GETBYTE(t1, 0)] ^
356
Td0[GETBYTE(t1, 3)] ^
357
Td1[GETBYTE(t0, 2)] ^
358
Td2[GETBYTE(t3, 1)] ^
359
Td3[GETBYTE(t2, 0)] ^
362
Td0[GETBYTE(t2, 3)] ^
363
Td1[GETBYTE(t1, 2)] ^
364
Td2[GETBYTE(t0, 1)] ^
365
Td3[GETBYTE(t3, 0)] ^
368
Td0[GETBYTE(t3, 3)] ^
369
Td1[GETBYTE(t2, 2)] ^
370
Td2[GETBYTE(t1, 1)] ^
371
Td3[GETBYTE(t0, 0)] ^
375
Td0[GETBYTE(s0, 3)] ^
376
Td1[GETBYTE(s3, 2)] ^
377
Td2[GETBYTE(s2, 1)] ^
378
Td3[GETBYTE(s1, 0)] ^
381
Td0[GETBYTE(s1, 3)] ^
382
Td1[GETBYTE(s0, 2)] ^
383
Td2[GETBYTE(s3, 1)] ^
384
Td3[GETBYTE(s2, 0)] ^
387
Td0[GETBYTE(s2, 3)] ^
388
Td1[GETBYTE(s1, 2)] ^
389
Td2[GETBYTE(s0, 1)] ^
390
Td3[GETBYTE(s3, 0)] ^
393
Td0[GETBYTE(s3, 3)] ^
394
Td1[GETBYTE(s2, 2)] ^
395
Td2[GETBYTE(s1, 1)] ^
396
Td3[GETBYTE(s0, 0)] ^
634
402
// timing attack countermeasure. see comments at top for more details
636
for (i=0; i<256; i+=cacheLineSize)
637
u &= *(const word32 *)(Sd+i);
638
u &= *(const word32 *)(Sd+252);
404
for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
405
u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]);
639
406
t0 |= u; t1 |= u; t2 |= u; t3 |= u;
643
410
word32 *const obw = (word32 *)outBlock;
644
411
const word32 *const xbw = (const word32 *)xorBlock;
646
#define QUARTER_ROUND(t, a, b, c, d) \
647
tempBlock[a] = Sd[byte(t)]; t >>= 8;\
648
tempBlock[b] = Sd[byte(t)]; t >>= 8;\
649
tempBlock[c] = Sd[byte(t)]; t >>= 8;\
650
tempBlock[d] = Sd[t];
652
QUARTER_ROUND(t2, 7, 2, 13, 8)
653
QUARTER_ROUND(t1, 3, 14, 9, 4)
654
QUARTER_ROUND(t0, 15, 10, 5, 0)
655
QUARTER_ROUND(t3, 11, 6, 1, 12)
414
tempBlock[0] = Sd[GETBYTE(t0, 3)];
415
tempBlock[1] = Sd[GETBYTE(t3, 2)];
416
tempBlock[2] = Sd[GETBYTE(t2, 1)];
417
tempBlock[3] = Sd[GETBYTE(t1, 0)];
418
tempBlock[4] = Sd[GETBYTE(t1, 3)];
419
tempBlock[5] = Sd[GETBYTE(t0, 2)];
420
tempBlock[6] = Sd[GETBYTE(t3, 1)];
421
tempBlock[7] = Sd[GETBYTE(t2, 0)];
422
tempBlock[8] = Sd[GETBYTE(t2, 3)];
423
tempBlock[9] = Sd[GETBYTE(t1, 2)];
424
tempBlock[10] = Sd[GETBYTE(t0, 1)];
425
tempBlock[11] = Sd[GETBYTE(t3, 0)];
426
tempBlock[12] = Sd[GETBYTE(t3, 3)];
427
tempBlock[13] = Sd[GETBYTE(t2, 2)];
428
tempBlock[14] = Sd[GETBYTE(t1, 1)];
429
tempBlock[15] = Sd[GETBYTE(t0, 0)];