7
7
***************************************************************************/
11
#include <wmmintrin.h>
10
14
static byte S[256],S5[256],rcon[30];
11
15
static byte T1[256][4],T2[256][4],T3[256][4],T4[256][4];
12
16
static byte T5[256][4],T6[256][4],T7[256][4],T8[256][4];
13
17
static byte U1[256][4],U2[256][4],U3[256][4],U4[256][4];
16
inline void Xor128(byte *dest,const byte *arg1,const byte *arg2)
20
inline void Xor128(void *dest,const void *arg1,const void *arg2)
18
#if defined(PRESENT_INT32) && defined(ALLOW_NOT_ALIGNED_INT)
22
#if defined(PRESENT_INT32) && defined(ALLOW_MISALIGNED)
19
23
((uint32*)dest)[0]=((uint32*)arg1)[0]^((uint32*)arg2)[0];
20
24
((uint32*)dest)[1]=((uint32*)arg1)[1]^((uint32*)arg2)[1];
21
25
((uint32*)dest)[2]=((uint32*)arg1)[2]^((uint32*)arg2)[2];
22
26
((uint32*)dest)[3]=((uint32*)arg1)[3]^((uint32*)arg2)[3];
24
28
for (int I=0;I<16;I++)
25
dest[I]=arg1[I]^arg2[I];
29
((byte*)dest)[I]=((byte*)arg1)[I]^((byte*)arg2)[I];
102
size_t Rijndael::blockDecrypt(const byte *input, size_t inputLen, byte *outBuffer)
118
void Rijndael::blockDecrypt(const byte *input, size_t inputLen, byte *outBuffer)
104
if (input == 0 || inputLen <= 0)
123
size_t numBlocks=inputLen/16;
127
blockDecryptSSE(input,numBlocks,outBuffer);
107
132
byte block[16], iv[4][4];
108
133
memcpy(iv,m_initVector,16);
110
size_t numBlocks=inputLen/16;
111
135
for (size_t i = numBlocks; i > 0; i--)
113
decrypt(input, block);
114
Xor128(block,block,(byte*)iv);
116
memcpy(iv, input, 16);
117
memcpy(outBuf, block, 16);
139
Xor128(temp,input,m_expandedKey[m_uRounds]);
141
Xor128(block, T5[temp[0][0]],T6[temp[3][1]],T7[temp[2][2]],T8[temp[1][3]]);
142
Xor128(block+4, T5[temp[1][0]],T6[temp[0][1]],T7[temp[3][2]],T8[temp[2][3]]);
143
Xor128(block+8, T5[temp[2][0]],T6[temp[1][1]],T7[temp[0][2]],T8[temp[3][3]]);
144
Xor128(block+12,T5[temp[3][0]],T6[temp[2][1]],T7[temp[1][2]],T8[temp[0][3]]);
146
for(int r = m_uRounds-1; r > 1; r--)
148
Xor128(temp,block,m_expandedKey[r]);
149
Xor128(block, T5[temp[0][0]],T6[temp[3][1]],T7[temp[2][2]],T8[temp[1][3]]);
150
Xor128(block+4, T5[temp[1][0]],T6[temp[0][1]],T7[temp[3][2]],T8[temp[2][3]]);
151
Xor128(block+8, T5[temp[2][0]],T6[temp[1][1]],T7[temp[0][2]],T8[temp[3][3]]);
152
Xor128(block+12,T5[temp[3][0]],T6[temp[2][1]],T7[temp[1][2]],T8[temp[0][3]]);
155
Xor128(temp,block,m_expandedKey[1]);
156
block[ 0] = S5[temp[0][0]];
157
block[ 1] = S5[temp[3][1]];
158
block[ 2] = S5[temp[2][2]];
159
block[ 3] = S5[temp[1][3]];
160
block[ 4] = S5[temp[1][0]];
161
block[ 5] = S5[temp[0][1]];
162
block[ 6] = S5[temp[3][2]];
163
block[ 7] = S5[temp[2][3]];
164
block[ 8] = S5[temp[2][0]];
165
block[ 9] = S5[temp[1][1]];
166
block[10] = S5[temp[0][2]];
167
block[11] = S5[temp[3][3]];
168
block[12] = S5[temp[3][0]];
169
block[13] = S5[temp[2][1]];
170
block[14] = S5[temp[1][2]];
171
block[15] = S5[temp[0][3]];
172
Xor128(block,block,m_expandedKey[0]);
175
Xor128(block,block,iv);
119
177
Copy128((byte*)iv,input);
120
178
Copy128(outBuffer,block);
126
184
memcpy(m_initVector,iv,16);
189
void Rijndael::blockDecryptSSE(const byte *input, size_t numBlocks, byte *outBuffer)
191
__m128i initVector = _mm_loadu_si128((__m128i*)m_initVector);
192
__m128i *src=(__m128i*)input;
193
__m128i *dest=(__m128i*)outBuffer;
194
__m128i *rkey=(__m128i*)m_expandedKey;
195
while (numBlocks > 0)
197
__m128i rl = _mm_loadu_si128(rkey + m_uRounds);
198
__m128i d = _mm_loadu_si128(src++);
199
__m128i v = _mm_xor_si128(rl, d);
201
for (int i=m_uRounds-1; i>0; i--)
203
__m128i ri = _mm_loadu_si128(rkey + i);
204
v = _mm_aesdec_si128(v, ri);
207
__m128i r0 = _mm_loadu_si128(rkey);
208
v = _mm_aesdeclast_si128(v, r0);
211
v = _mm_xor_si128(v, initVector);
213
_mm_storeu_si128(dest++,v);
216
_mm_storeu_si128((__m128i*)m_initVector,initVector);
132
221
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
224
void Rijndael::decrypt(const byte a[16], byte b[16])
229
Xor128((byte*)temp,(byte*)a,(byte*)m_expandedKey[m_uRounds]);
231
Xor128(b, T5[temp[0][0]],T6[temp[3][1]],T7[temp[2][2]],T8[temp[1][3]]);
232
Xor128(b+4, T5[temp[1][0]],T6[temp[0][1]],T7[temp[3][2]],T8[temp[2][3]]);
233
Xor128(b+8, T5[temp[2][0]],T6[temp[1][1]],T7[temp[0][2]],T8[temp[3][3]]);
234
Xor128(b+12,T5[temp[3][0]],T6[temp[2][1]],T7[temp[1][2]],T8[temp[0][3]]);
236
for(r = m_uRounds-1; r > 1; r--)
238
Xor128((byte*)temp,(byte*)b,(byte*)m_expandedKey[r]);
239
Xor128(b, T5[temp[0][0]],T6[temp[3][1]],T7[temp[2][2]],T8[temp[1][3]]);
240
Xor128(b+4, T5[temp[1][0]],T6[temp[0][1]],T7[temp[3][2]],T8[temp[2][3]]);
241
Xor128(b+8, T5[temp[2][0]],T6[temp[1][1]],T7[temp[0][2]],T8[temp[3][3]]);
242
Xor128(b+12,T5[temp[3][0]],T6[temp[2][1]],T7[temp[1][2]],T8[temp[0][3]]);
245
Xor128((byte*)temp,(byte*)b,(byte*)m_expandedKey[1]);
246
b[ 0] = S5[temp[0][0]];
247
b[ 1] = S5[temp[3][1]];
248
b[ 2] = S5[temp[2][2]];
249
b[ 3] = S5[temp[1][3]];
250
b[ 4] = S5[temp[1][0]];
251
b[ 5] = S5[temp[0][1]];
252
b[ 6] = S5[temp[3][2]];
253
b[ 7] = S5[temp[2][3]];
254
b[ 8] = S5[temp[2][0]];
255
b[ 9] = S5[temp[1][1]];
256
b[10] = S5[temp[0][2]];
257
b[11] = S5[temp[3][3]];
258
b[12] = S5[temp[3][0]];
259
b[13] = S5[temp[2][1]];
260
b[14] = S5[temp[1][2]];
261
b[15] = S5[temp[0][3]];
262
Xor128((byte*)b,(byte*)b,(byte*)m_expandedKey[0]);
265
313
#define ff_poly 0x011b
266
314
#define ff_hi 0x80