1
/* AesOpt.c -- Intel's AES
2009-11-23 : Igor Pavlov : Public domain */
6
#ifdef MY_CPU_X86_OR_AMD64
14
#include <wmmintrin.h>
16
/* AesCbc_Encode_Intel: CBC-mode AES encryption written with the AES
   intrinsics from <wmmintrin.h>.

   p         - parameter area addressed in __m128i units. The visible code
               reads a UInt32 from the start of p[1], XORs p[2] into the
               state, and takes the keys for the aesenc calls from p + 3.
   data      - blocks to process; advanced by one __m128i per iteration.
   numBlocks - number of blocks to process.

   NOTE(review): this listing is missing several lines of the function
   (the declaration and initial value of m, the opening of the round loop,
   any stores back to *data or *p, and all braces). The comments below
   describe only the statements that are visible. */
void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
19
/* One iteration per block. */
for (; numBlocks != 0; numBlocks--, data++)
21
/* Round-loop counter: the UInt32 stored at p[1], minus one.
   Presumably that UInt32 is half the AES round count -- confirm. */
UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
22
/* w addresses the round keys used by the aesenc calls below. */
const __m128i *w = p + 3;
23
/* Mix the current block into the running state m (CBC chaining; m
   presumably holds the IV or the previous output block -- confirm). */
m = _mm_xor_si128(m, *data);
24
/* XOR in p[2], the 16-byte word immediately before the w[] keys
   (presumably the initial round key -- confirm). */
m = _mm_xor_si128(m, p[2]);
27
/* Two encryption rounds with two consecutive round keys. */
m = _mm_aesenc_si128(m, w[0]);
28
m = _mm_aesenc_si128(m, w[1]);
31
/* Tail of the round loop; its opening and any advance of w are not
   visible in this listing. */
while (--numRounds2 != 0);
32
/* Closing pair: one ordinary round, then the final round (aesenclast). */
m = _mm_aesenc_si128(m, w[0]);
33
m = _mm_aesenclast_si128(m, w[1]);
41
/* AES_OP_W(op, n): opens a braced scope and loads round key w[n] into a
   local constant t. It relies on a variable named w being in scope at the
   expansion site.
   NOTE(review): the remainder of the macro body (presumably applying op
   to each state variable with t, and the closing brace) is not visible in
   this listing -- confirm against the full file. */
#define AES_OP_W(op, n) { \
42
const __m128i t = w[n]; \
48
/* Shorthands that bind AES_OP_W to one specific AES intrinsic each:
   decrypt round, last decrypt round, encrypt round, last encrypt round. */
#define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
49
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
50
#define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
51
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
53
/* AesCbc_Decode_Intel: CBC-mode AES decryption written with the AES
   intrinsics. A first loop handles NUM_WAYS blocks per iteration; a
   second loop handles the remaining blocks one at a time.

   p         - parameter area addressed in __m128i units. The visible code
               reads a UInt32 from the start of p[1] and locates the round
               keys relative to p + 2 * (that value).
   data      - blocks to process, modified in place.
   numBlocks - number of blocks to process.

   NOTE(review): this listing is missing several lines of the function
   (the declarations of iv, m0..m2 and the non-const t, the round-loop
   bodies, the final stores and all braces). NUM_WAYS is defined outside
   the visible lines; the code below touches data[0..2], so it is
   presumably 3 -- confirm. */
void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
56
/* Batched path: runs while at least NUM_WAYS blocks remain. */
for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
58
/* Round-loop counter taken from the UInt32 stored at p[1]. */
UInt32 numRounds2 = *(const UInt32 *)(p + 1);
59
/* w starts near the end of the key area. The visible single-block path
   reads w[2], then w[1], then w[0], i.e. keys in descending order. */
const __m128i *w = p + numRounds2 * 2;
62
/* First key applied: XOR w[2] into each of the three input blocks. */
const __m128i t = w[2];
63
m0 = _mm_xor_si128(t, data[0]);
64
m1 = _mm_xor_si128(t, data[1]);
65
m2 = _mm_xor_si128(t, data[2]);
74
/* Tail of the round loop; its body is not visible in this listing. */
while (--numRounds2 != 0);
80
/* CBC un-chaining, block by block: XOR the decrypted state with iv, then
   save the original input block as the next iv BEFORE overwriting it
   with the result. */
t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
81
t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
82
t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
85
/* Remainder path: fewer than NUM_WAYS blocks are left, one per pass. */
for (; numBlocks != 0; numBlocks--, data++)
87
UInt32 numRounds2 = *(const UInt32 *)(p + 1);
88
const __m128i *w = p + numRounds2 * 2;
89
/* First key applied: XOR w[2] into the input block. */
__m128i m = _mm_xor_si128(w[2], *data);
93
/* Two decryption rounds, keys taken in descending order. */
m = _mm_aesdec_si128(m, w[1]);
94
m = _mm_aesdec_si128(m, w[0]);
97
/* Tail of the round loop; its opening and any adjustment of w are not
   visible in this listing. */
while (--numRounds2 != 0);
98
/* Closing pair: one ordinary round, then the final round (aesdeclast). */
m = _mm_aesdec_si128(m, w[1]);
99
m = _mm_aesdeclast_si128(m, w[0]);
101
/* Undo the CBC chaining. The store to *data and the update of iv are not
   visible in this listing. */
m = _mm_xor_si128(m, iv);
108
/* AesCtr_Code_Intel: counter-mode AES written with the AES intrinsics.
   Each block of data is XORed with an encrypted counter value, so the
   same routine serves both directions. A first loop handles NUM_WAYS
   blocks per iteration; a second loop handles the remaining blocks.

   p         - parameter area addressed in __m128i units. The visible code
               reads a UInt32 from the start of p[1] and XORs p[2] into the
               counter before the rounds.
   data      - blocks to process, modified in place.
   numBlocks - number of blocks to process.

   NOTE(review): this listing is missing several lines of the function
   (the declarations of ctr, one, m and m0..m2, the initial value of ctr,
   the round-loop bodies, any advance of w, any store of ctr back to p,
   and all braces). NUM_WAYS is defined outside the visible lines. */
void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
112
/* Increment constant: 64-bit element 0 is 1, element 1 is 0.
   NOTE(review): m128i_u64 is the member-access form of __m128i offered by
   MSVC; other compilers may not provide it -- confirm the build target. */
one.m128i_u64[0] = 1;
113
one.m128i_u64[1] = 0;
114
/* Batched path: runs while at least NUM_WAYS blocks remain. */
for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
116
/* Round-loop counter: the UInt32 stored at p[1], minus one. */
UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
117
const __m128i *w = p;
120
/* t = w[2] is XORed into each counter value before the rounds. */
const __m128i t = w[2];
121
/* The counter is incremented BEFORE each use. _mm_add_epi64 adds the two
   64-bit elements independently, so there is no carry from element 0
   into element 1. */
ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
122
ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
123
ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
132
/* Tail of the round loop; its body is not visible in this listing. */
while (--numRounds2 != 0);
135
/* XOR each encrypted counter value into its data block, in place. */
data[0] = _mm_xor_si128(data[0], m0);
136
data[1] = _mm_xor_si128(data[1], m1);
137
data[2] = _mm_xor_si128(data[2], m2);
139
/* Remainder path: fewer than NUM_WAYS blocks are left, one per pass. */
for (; numBlocks != 0; numBlocks--, data++)
141
UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
142
const __m128i *w = p;
144
/* Advance the counter, then XOR p[2] into it before the rounds. */
ctr = _mm_add_epi64(ctr, one);
145
m = _mm_xor_si128(ctr, p[2]);
149
/* Two encryption rounds with two consecutive round keys.
   NOTE(review): lines between the assignment of w and this point are
   missing, so w may have been advanced past p -- confirm. */
m = _mm_aesenc_si128(m, w[0]);
150
m = _mm_aesenc_si128(m, w[1]);
153
/* Tail of the round loop; its opening and any advance of w are not
   visible in this listing. */
while (--numRounds2 != 0);
154
/* Closing pair: one ordinary round, then the final round (aesenclast). */
m = _mm_aesenc_si128(m, w[0]);
155
m = _mm_aesenclast_si128(m, w[1]);
156
/* XOR the encrypted counter value into the data block, in place. */
*data = _mm_xor_si128(*data, m);
163
/* Prototypes of the AES routines that the *_Intel wrappers following these
   declarations forward to. Each takes a pointer to UInt32 (ivAes), a
   pointer to Byte (data) and a block count. Their definitions are not
   part of the visible lines. */
void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
164
void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
165
void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
167
/* Variant of AesCbc_Encode_Intel with UInt32 and Byte pointer parameters:
   passes p, data and numBlocks unchanged to AesCbc_Encode.
   NOTE(review): a function of the same name with __m128i parameters
   appears earlier in this file; presumably a preprocessor conditional
   that is not visible in this listing selects one of the two -- confirm.
   The braces of this definition are also missing from the listing. */
void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
169
AesCbc_Encode(p, data, numBlocks);
172
/* Variant of AesCbc_Decode_Intel with UInt32 and Byte pointer parameters:
   passes p, data and numBlocks unchanged to AesCbc_Decode.
   NOTE(review): a function of the same name with __m128i parameters
   appears earlier in this file; presumably a preprocessor conditional
   that is not visible in this listing selects one of the two -- confirm.
   The braces of this definition are also missing from the listing. */
void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
174
AesCbc_Decode(p, data, numBlocks);
177
/* Variant of AesCtr_Code_Intel with UInt32 and Byte pointer parameters:
   passes p, data and numBlocks unchanged to AesCtr_Code.
   NOTE(review): a function of the same name with __m128i parameters
   appears earlier in this file; presumably a preprocessor conditional
   that is not visible in this listing selects one of the two -- confirm.
   The braces of this definition are also missing from the listing. */
void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
179
AesCtr_Code(p, data, numBlocks);