~ubuntu-branches/debian/sid/botan/sid

« back to all changes in this revision

Viewing changes to src/lib/block/aes/aes_ssse3/aes_ssse3.cpp

  • Committer: Package Import Robot
  • Author(s): Laszlo Boszormenyi (GCS)
  • Date: 2018-03-01 22:23:25 UTC
  • mfrom: (1.2.2)
  • Revision ID: package-import@ubuntu.com-20180301222325-7p7vc45gu3hta34d
Tags: 2.4.0-2
* Don't remove .doctrees from the manual if it doesn't exist.
* Don't specify parallel to debhelper.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
* AES using SSSE3
 
3
* (C) 2010,2016 Jack Lloyd
 
4
*
 
5
* This is more or less a direct translation of public domain x86-64
 
6
* assembly written by Mike Hamburg, described in "Accelerating AES
 
7
* with Vector Permute Instructions" (CHES 2009). His original code is
 
8
* available at https://crypto.stanford.edu/vpaes/
 
9
*
 
10
* Botan is released under the Simplified BSD License (see license.txt)
 
11
*/
 
12
 
 
13
#include <botan/aes.h>
 
14
#include <botan/internal/ct_utils.h>
 
15
#include <tmmintrin.h>
 
16
 
 
17
namespace Botan {
 
18
 
 
19
namespace {
 
20
 
 
21
// 0x0F in every byte: used to split each byte into its low and high nibble.
const __m128i low_nibs = _mm_set1_epi8(0x0F);

// Table pair applied to raw key material through aes_schedule_transform and
// to the input block at the start of aes_ssse3_encrypt.
const __m128i k_ipt1 = _mm_set_epi32(0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000);
const __m128i k_ipt2 = _mm_set_epi32(0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00);

// Nibble-indexed tables shared by the substitution step of
// aes_schedule_round, aes_ssse3_encrypt and aes_ssse3_decrypt.
const __m128i k_inv1 = _mm_set_epi32(0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180);
const __m128i k_inv2 = _mm_set_epi32(0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780);

// Output table pair used by aes_schedule_round and by every non-final
// round of aes_ssse3_encrypt.
const __m128i sb1u = _mm_set_epi32(0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00);
const __m128i sb1t = _mm_set_epi32(0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300);

// Byte-shuffle masks. Entry 0 rotates the bytes inside each 32-bit word by
// one position; each following entry is the previous one with its 32-bit
// words rotated by one.
const __m128i mc_forward[4] = {
   _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201),
   _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605),
   _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09),
   _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D)
};

// Byte-shuffle masks selected by "round number mod 4"; sr[0] is the
// identity permutation.
const __m128i sr[4] = {
   _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100),
   _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500),
   _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900),
   _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00),
};

// XOR of three 128-bit vectors.
#define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z))
 
53
 
 
54
BOTAN_FUNC_ISA("ssse3")
 
55
__m128i aes_schedule_transform(__m128i input,
 
56
                               __m128i table_1,
 
57
                               __m128i table_2)
 
58
   {
 
59
   __m128i i_1 = _mm_and_si128(low_nibs, input);
 
60
   __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4);
 
61
 
 
62
   return _mm_xor_si128(
 
63
      _mm_shuffle_epi8(table_1, i_1),
 
64
      _mm_shuffle_epi8(table_2, i_2));
 
65
   }
 
66
 
 
67
BOTAN_FUNC_ISA("ssse3")
 
68
__m128i aes_schedule_mangle(__m128i k, uint8_t round_no)
 
69
   {
 
70
   __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)),
 
71
                                mc_forward[0]);
 
72
 
 
73
   __m128i t2 = t;
 
74
 
 
75
   t = _mm_shuffle_epi8(t, mc_forward[0]);
 
76
 
 
77
   t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0]));
 
78
 
 
79
   return _mm_shuffle_epi8(t2, sr[round_no % 4]);
 
80
   }
 
81
 
 
82
/*
* Helper for the AES-192 schedule: returns y XOR two word-shuffled
* vectors built from x and y.
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_schedule_192_smear(__m128i x, __m128i y)
   {
   // 32-bit words, lowest first: { x[2], x[3], x[3], x[3] }
   const __m128i x_words = _mm_shuffle_epi32(x, 0xFE);

   // 32-bit words, lowest first: { y[0], y[0], y[0], y[2] }
   const __m128i y_words = _mm_shuffle_epi32(y, 0x80);

   return mm_xor3(y, x_words, y_words);
   }
 
89
 
 
90
BOTAN_FUNC_ISA("ssse3")
 
91
__m128i aes_schedule_mangle_dec(__m128i k, uint8_t round_no)
 
92
   {
 
93
   const __m128i dsk[8] = {
 
94
      _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700),
 
95
      _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300),
 
96
      _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400),
 
97
      _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00),
 
98
      _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700),
 
99
      _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700),
 
100
      _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000),
 
101
      _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8)
 
102
   };
 
103
 
 
104
   __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]);
 
105
   __m128i output = _mm_shuffle_epi8(t, mc_forward[0]);
 
106
 
 
107
   t = aes_schedule_transform(t, dsk[2], dsk[3]);
 
108
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
 
109
 
 
110
   t = aes_schedule_transform(t, dsk[4], dsk[5]);
 
111
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
 
112
 
 
113
   t = aes_schedule_transform(t, dsk[6], dsk[7]);
 
114
   output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]);
 
115
 
 
116
   return _mm_shuffle_epi8(output, sr[round_no % 4]);
 
117
   }
 
118
 
 
119
BOTAN_FUNC_ISA("ssse3")
 
120
__m128i aes_schedule_mangle_last(__m128i k, uint8_t round_no)
 
121
   {
 
122
   const __m128i out_tr1 = _mm_set_epi32(
 
123
      0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000);
 
124
   const __m128i out_tr2 = _mm_set_epi32(
 
125
      0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00);
 
126
 
 
127
   k = _mm_shuffle_epi8(k, sr[round_no % 4]);
 
128
   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
 
129
   return aes_schedule_transform(k, out_tr1, out_tr2);
 
130
   }
 
131
 
 
132
/*
* Mangling used for the decryption key stored at index 0: XOR every
* byte of k with 0x5B and apply the deskew1/deskew2 table transform.
*/
BOTAN_FUNC_ISA("ssse3")
__m128i aes_schedule_mangle_last_dec(__m128i k)
   {
   const __m128i deskew1 = _mm_set_epi32(
      0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
   const __m128i deskew2 = _mm_set_epi32(
      0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);

   const __m128i biased = _mm_xor_si128(k, _mm_set1_epi8(0x5B));

   return aes_schedule_transform(biased, deskew1, deskew2);
   }
 
143
 
 
144
BOTAN_FUNC_ISA("ssse3")
 
145
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
 
146
   {
 
147
   if(rcon)
 
148
      {
 
149
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
 
150
                             input2);
 
151
 
 
152
      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon
 
153
 
 
154
      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
 
155
      input1 = _mm_alignr_epi8(input1, input1, 1);
 
156
      }
 
157
 
 
158
   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
 
159
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));
 
160
 
 
161
   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);
 
162
 
 
163
   input1 = _mm_and_si128(low_nibs, input1);
 
164
 
 
165
   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);
 
166
 
 
167
   input1 = _mm_xor_si128(input1, t);
 
168
 
 
169
   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
 
170
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
 
171
 
 
172
   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
 
173
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
 
174
 
 
175
   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
 
176
                  _mm_shuffle_epi8(sb1t, t6),
 
177
                  smeared);
 
178
   }
 
179
 
 
180
BOTAN_FUNC_ISA("ssse3")
 
181
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
 
182
   {
 
183
   const __m128i sb2u = _mm_set_epi32(
 
184
      0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
 
185
   const __m128i sb2t = _mm_set_epi32(
 
186
      0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);
 
187
 
 
188
   const __m128i sbou = _mm_set_epi32(
 
189
      0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
 
190
   const __m128i sbot = _mm_set_epi32(
 
191
      0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);
 
192
 
 
193
   const __m128i mc_backward[4] = {
 
194
      _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
 
195
      _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
 
196
      _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
 
197
      _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
 
198
   };
 
199
 
 
200
   B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
 
201
               _mm_shuffle_epi8(k_ipt2,
 
202
                                _mm_srli_epi32(
 
203
                                   _mm_andnot_si128(low_nibs, B),
 
204
                                   4)),
 
205
               _mm_loadu_si128(keys));
 
206
 
 
207
   for(size_t r = 1; ; ++r)
 
208
      {
 
209
      const __m128i K = _mm_loadu_si128(keys + r);
 
210
 
 
211
      __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
 
212
 
 
213
      B = _mm_and_si128(low_nibs, B);
 
214
 
 
215
      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
 
216
 
 
217
      B = _mm_xor_si128(B, t);
 
218
 
 
219
      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
 
220
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
 
221
 
 
222
      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
 
223
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
 
224
 
 
225
      if(r == rounds)
 
226
         {
 
227
         B = _mm_shuffle_epi8(
 
228
            mm_xor3(_mm_shuffle_epi8(sbou, t5),
 
229
                    _mm_shuffle_epi8(sbot, t6),
 
230
                    K),
 
231
            sr[r % 4]);
 
232
 
 
233
         return B;
 
234
         }
 
235
 
 
236
      __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
 
237
                           _mm_shuffle_epi8(sb1u, t5),
 
238
                           K);
 
239
 
 
240
      __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
 
241
                           _mm_shuffle_epi8(sb2u, t5),
 
242
                           _mm_shuffle_epi8(t7, mc_forward[r % 4]));
 
243
 
 
244
      B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
 
245
                  _mm_shuffle_epi8(t7, mc_backward[r % 4]),
 
246
                  t8);
 
247
      }
 
248
   }
 
249
 
 
250
BOTAN_FUNC_ISA("ssse3")
 
251
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds)
 
252
   {
 
253
   const __m128i k_dipt1 = _mm_set_epi32(
 
254
      0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
 
255
   const __m128i k_dipt2 = _mm_set_epi32(
 
256
      0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);
 
257
 
 
258
   const __m128i sb9u = _mm_set_epi32(
 
259
      0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
 
260
   const __m128i sb9t = _mm_set_epi32(
 
261
      0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);
 
262
 
 
263
   const __m128i sbeu = _mm_set_epi32(
 
264
      0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
 
265
   const __m128i sbet = _mm_set_epi32(
 
266
      0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);
 
267
 
 
268
   const __m128i sbdu = _mm_set_epi32(
 
269
      0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
 
270
   const __m128i sbdt = _mm_set_epi32(
 
271
      0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);
 
272
 
 
273
   const __m128i sbbu = _mm_set_epi32(
 
274
      0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
 
275
   const __m128i sbbt = _mm_set_epi32(
 
276
      0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);
 
277
 
 
278
   __m128i mc = mc_forward[3];
 
279
 
 
280
   __m128i t =
 
281
      _mm_shuffle_epi8(k_dipt2,
 
282
                       _mm_srli_epi32(
 
283
                          _mm_andnot_si128(low_nibs, B),
 
284
                          4));
 
285
 
 
286
   B = mm_xor3(t, _mm_loadu_si128(keys),
 
287
               _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));
 
288
 
 
289
   for(size_t r = 1; ; ++r)
 
290
      {
 
291
      const __m128i K = _mm_loadu_si128(keys + r);
 
292
 
 
293
      t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);
 
294
 
 
295
      B = _mm_and_si128(low_nibs, B);
 
296
 
 
297
      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);
 
298
 
 
299
      B = _mm_xor_si128(B, t);
 
300
 
 
301
      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
 
302
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
 
303
      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
 
304
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));
 
305
 
 
306
      if(r == rounds)
 
307
         {
 
308
         const __m128i sbou = _mm_set_epi32(
 
309
            0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
 
310
         const __m128i sbot = _mm_set_epi32(
 
311
            0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);
 
312
 
 
313
         __m128i x = _mm_shuffle_epi8(sbou, t5);
 
314
         __m128i y = _mm_shuffle_epi8(sbot, t6);
 
315
         x = _mm_xor_si128(x, K);
 
316
         x = _mm_xor_si128(x, y);
 
317
 
 
318
         const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
 
319
         return _mm_shuffle_epi8(x, sr[which_sr]);
 
320
         }
 
321
 
 
322
      __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
 
323
                                 _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));
 
324
 
 
325
      __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc),
 
326
                           _mm_shuffle_epi8(sbdu, t5),
 
327
                           _mm_shuffle_epi8(sbdt, t6));
 
328
 
 
329
      __m128i t12 = _mm_xor_si128(
 
330
         _mm_xor_si128(
 
331
            _mm_shuffle_epi8(t9, mc),
 
332
            _mm_shuffle_epi8(sbbu, t5)),
 
333
         _mm_shuffle_epi8(sbbt, t6));
 
334
 
 
335
      B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
 
336
                                      _mm_shuffle_epi8(sbeu, t5)),
 
337
                        _mm_shuffle_epi8(sbet, t6));
 
338
 
 
339
      mc = _mm_alignr_epi8(mc, mc, 12);
 
340
      }
 
341
   }
 
342
 
 
343
}
 
344
 
 
345
/*
 
346
* AES-128 Encryption
 
347
*/
 
348
BOTAN_FUNC_ISA("ssse3")
 
349
void AES_128::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 
350
   {
 
351
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
 
352
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
 
353
 
 
354
   const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data());
 
355
 
 
356
   CT::poison(in, blocks * block_size());
 
357
 
 
358
   BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
 
359
      {
 
360
      __m128i B = _mm_loadu_si128(in_mm + i);
 
361
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10));
 
362
      }
 
363
 
 
364
   CT::unpoison(in,  blocks * block_size());
 
365
   CT::unpoison(out, blocks * block_size());
 
366
   }
 
367
 
 
368
/*
 
369
* AES-128 Decryption
 
370
*/
 
371
BOTAN_FUNC_ISA("ssse3")
 
372
void AES_128::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 
373
   {
 
374
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
 
375
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
 
376
 
 
377
   const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data());
 
378
 
 
379
   CT::poison(in, blocks * block_size());
 
380
 
 
381
   BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
 
382
      {
 
383
      __m128i B = _mm_loadu_si128(in_mm + i);
 
384
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10));
 
385
      }
 
386
 
 
387
   CT::unpoison(in,  blocks * block_size());
 
388
   CT::unpoison(out, blocks * block_size());
 
389
   }
 
390
 
 
391
/*
 
392
* AES-128 Key Schedule
 
393
*/
 
394
BOTAN_FUNC_ISA("ssse3")
 
395
void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t)
 
396
   {
 
397
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
 
398
                                0x1F8391B9, 0xAF9DEEB6);
 
399
 
 
400
   __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
 
401
 
 
402
   m_EK.resize(11*4);
 
403
   m_DK.resize(11*4);
 
404
 
 
405
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
 
406
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
 
407
 
 
408
   _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2]));
 
409
 
 
410
   key = aes_schedule_transform(key, k_ipt1, k_ipt2);
 
411
 
 
412
   _mm_storeu_si128(EK_mm, key);
 
413
 
 
414
   for(size_t i = 1; i != 10; ++i)
 
415
      {
 
416
      key = aes_schedule_round(&rcon, key, key);
 
417
 
 
418
      _mm_storeu_si128(EK_mm + i,
 
419
                       aes_schedule_mangle(key, (12-i) % 4));
 
420
 
 
421
      _mm_storeu_si128(DK_mm + (10-i),
 
422
                       aes_schedule_mangle_dec(key, (10-i) % 4));
 
423
      }
 
424
 
 
425
   key = aes_schedule_round(&rcon, key, key);
 
426
   _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2));
 
427
   _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key));
 
428
   }
 
429
 
 
430
/*
 
431
* AES-192 Encryption
 
432
*/
 
433
BOTAN_FUNC_ISA("ssse3")
 
434
void AES_192::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 
435
   {
 
436
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
 
437
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
 
438
 
 
439
   const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data());
 
440
 
 
441
   CT::poison(in, blocks * block_size());
 
442
 
 
443
   for(size_t i = 0; i != blocks; ++i)
 
444
      {
 
445
      __m128i B = _mm_loadu_si128(in_mm + i);
 
446
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12));
 
447
      }
 
448
 
 
449
   CT::unpoison(in,  blocks * block_size());
 
450
   CT::unpoison(out, blocks * block_size());
 
451
   }
 
452
 
 
453
/*
 
454
* AES-192 Decryption
 
455
*/
 
456
BOTAN_FUNC_ISA("ssse3")
 
457
void AES_192::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 
458
   {
 
459
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
 
460
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
 
461
 
 
462
   const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data());
 
463
 
 
464
   CT::poison(in, blocks * block_size());
 
465
 
 
466
   for(size_t i = 0; i != blocks; ++i)
 
467
      {
 
468
      __m128i B = _mm_loadu_si128(in_mm + i);
 
469
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12));
 
470
      }
 
471
 
 
472
   CT::unpoison(in,  blocks * block_size());
 
473
   CT::unpoison(out, blocks * block_size());
 
474
   }
 
475
 
 
476
/*
 
477
* AES-192 Key Schedule
 
478
*/
 
479
BOTAN_FUNC_ISA("ssse3")
 
480
void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t)
 
481
   {
 
482
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
 
483
                                0x1F8391B9, 0xAF9DEEB6);
 
484
 
 
485
   m_EK.resize(13*4);
 
486
   m_DK.resize(13*4);
 
487
 
 
488
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
 
489
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
 
490
 
 
491
   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
 
492
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 8)));
 
493
 
 
494
   _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0]));
 
495
 
 
496
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
 
497
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
 
498
 
 
499
   _mm_storeu_si128(EK_mm + 0, key1);
 
500
 
 
501
   // key2 with 8 high bytes masked off
 
502
   __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
 
503
 
 
504
   for(size_t i = 0; i != 4; ++i)
 
505
      {
 
506
      key2 = aes_schedule_round(&rcon, key2, key1);
 
507
 
 
508
      _mm_storeu_si128(EK_mm + 3*i+1,
 
509
                       aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
 
510
      _mm_storeu_si128(DK_mm + 11-3*i,
 
511
                       aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4));
 
512
 
 
513
      t = aes_schedule_192_smear(key2, t);
 
514
 
 
515
      _mm_storeu_si128(EK_mm + 3*i+2,
 
516
                       aes_schedule_mangle(t, (i+2)%4));
 
517
      _mm_storeu_si128(DK_mm + 10-3*i,
 
518
                       aes_schedule_mangle_dec(t, (i+2)%4));
 
519
 
 
520
      key2 = aes_schedule_round(&rcon, t, key2);
 
521
 
 
522
      if(i == 3)
 
523
         {
 
524
         _mm_storeu_si128(EK_mm + 3*i+3,
 
525
                          aes_schedule_mangle_last(key2, (i+1)%4));
 
526
         _mm_storeu_si128(DK_mm + 9-3*i,
 
527
                          aes_schedule_mangle_last_dec(key2));
 
528
         }
 
529
      else
 
530
         {
 
531
         _mm_storeu_si128(EK_mm + 3*i+3,
 
532
                          aes_schedule_mangle(key2, (i+1)%4));
 
533
         _mm_storeu_si128(DK_mm + 9-3*i,
 
534
                          aes_schedule_mangle_dec(key2, (i+1)%4));
 
535
         }
 
536
 
 
537
      key1 = key2;
 
538
      key2 = aes_schedule_192_smear(key2,
 
539
                                    _mm_slli_si128(_mm_srli_si128(t, 8), 8));
 
540
      t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8);
 
541
      }
 
542
   }
 
543
 
 
544
/*
 
545
* AES-256 Encryption
 
546
*/
 
547
BOTAN_FUNC_ISA("ssse3")
 
548
void AES_256::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 
549
   {
 
550
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
 
551
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
 
552
 
 
553
   const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data());
 
554
 
 
555
   CT::poison(in, blocks * block_size());
 
556
 
 
557
   for(size_t i = 0; i != blocks; ++i)
 
558
      {
 
559
      __m128i B = _mm_loadu_si128(in_mm + i);
 
560
      _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14));
 
561
      }
 
562
 
 
563
   CT::unpoison(in,  blocks * block_size());
 
564
   CT::unpoison(out, blocks * block_size());
 
565
   }
 
566
 
 
567
/*
 
568
* AES-256 Decryption
 
569
*/
 
570
BOTAN_FUNC_ISA("ssse3")
 
571
void AES_256::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
 
572
   {
 
573
   const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
 
574
   __m128i* out_mm = reinterpret_cast<__m128i*>(out);
 
575
 
 
576
   const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data());
 
577
 
 
578
   CT::poison(in, blocks * block_size());
 
579
 
 
580
   for(size_t i = 0; i != blocks; ++i)
 
581
      {
 
582
      __m128i B = _mm_loadu_si128(in_mm + i);
 
583
      _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14));
 
584
      }
 
585
 
 
586
   CT::unpoison(in,  blocks * block_size());
 
587
   CT::unpoison(out, blocks * block_size());
 
588
   }
 
589
 
 
590
/*
 
591
* AES-256 Key Schedule
 
592
*/
 
593
BOTAN_FUNC_ISA("ssse3")
 
594
void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t)
 
595
   {
 
596
   __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81,
 
597
                                0x1F8391B9, 0xAF9DEEB6);
 
598
 
 
599
   m_EK.resize(15*4);
 
600
   m_DK.resize(15*4);
 
601
 
 
602
   __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data());
 
603
   __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data());
 
604
 
 
605
   __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb));
 
606
   __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 16)));
 
607
 
 
608
   _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2]));
 
609
 
 
610
   key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2);
 
611
   key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2);
 
612
 
 
613
   _mm_storeu_si128(EK_mm + 0, key1);
 
614
   _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3));
 
615
 
 
616
   _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1));
 
617
 
 
618
   for(size_t i = 2; i != 14; i += 2)
 
619
      {
 
620
      __m128i k_t = key2;
 
621
      key1 = key2 = aes_schedule_round(&rcon, key2, key1);
 
622
 
 
623
      _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4));
 
624
      _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4));
 
625
 
 
626
      key2 = aes_schedule_round(nullptr, _mm_shuffle_epi32(key2, 0xFF), k_t);
 
627
      _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4));
 
628
      _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4));
 
629
      }
 
630
 
 
631
   key2 = aes_schedule_round(&rcon, key2, key1);
 
632
 
 
633
   _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2));
 
634
   _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2));
 
635
   }
 
636
 
 
637
}