1
/*********************************************************************
2
* Copyright (c) 2016 Pieter Wuille *
3
* Distributed under the MIT software license, see the accompanying *
4
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
5
**********************************************************************/
7
/* Constant time, unoptimized, concise, plain C, AES implementation
9
* Emilia Kasper and Peter Schwabe, Faster and Timing-Attack Resistant AES-GCM
10
* http://www.iacr.org/archive/ches2009/57470001/57470001.pdf
11
* But using 8 16-bit integers representing a single AES state rather than 8 128-bit
12
* integers representing 8 AES states.
17
/* Slice variable slice_i contains the i'th bit of the 16 state variables in this order:
24
/** Convert a byte to sliced form, storing it corresponding to given row and column in s */
25
static void LoadByte(AES_state* s, unsigned char byte, int r, int c) {
27
for (i = 0; i < 8; i++) {
28
s->slice[i] |= (byte & 1) << (r * 4 + c);
33
/** Load 16 bytes of data into 8 sliced integers */
34
static void LoadBytes(AES_state *s, const unsigned char* data16) {
36
for (c = 0; c < 4; c++) {
38
for (r = 0; r < 4; r++) {
39
LoadByte(s, *(data16++), r, c);
44
/** Convert 8 sliced integers into 16 bytes of data */
45
static void SaveBytes(unsigned char* data16, const AES_state *s) {
47
for (c = 0; c < 4; c++) {
49
for (r = 0; r < 4; r++) {
52
for (b = 0; b < 8; b++) {
53
v |= ((s->slice[b] >> (r * 4 + c)) & 1) << b;
60
/* S-box implementation based on the gate logic from:
61
* Joan Boyar and Rene Peralta, A depth-16 circuit for the AES S-box.
62
* https://eprint.iacr.org/2011/332.pdf
64
static void SubBytes(AES_state *s, int inv) {
65
/* Load the bit slices */
66
uint16_t U0 = s->slice[7], U1 = s->slice[6], U2 = s->slice[5], U3 = s->slice[4];
67
uint16_t U4 = s->slice[3], U5 = s->slice[2], U6 = s->slice[1], U7 = s->slice[0];
69
uint16_t T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16;
70
uint16_t T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, D;
71
uint16_t M1, M6, M11, M13, M15, M20, M21, M22, M23, M25, M37, M38, M39, M40;
72
uint16_t M41, M42, M43, M44, M45, M46, M47, M48, M49, M50, M51, M52, M53, M54;
73
uint16_t M55, M56, M57, M58, M59, M60, M61, M62, M63;
76
uint16_t R5, R13, R17, R18, R19;
77
/* Undo linear postprocessing */
106
/* Linear preprocessing. */
137
/* Non-linear transformation (identical to the code in SubBytes) */
141
M13 = (T4 & T27) ^ M11;
142
M15 = (T2 & T10) ^ M11;
143
M20 = T14 ^ M1 ^ (T23 & T8) ^ M13;
144
M21 = (T19 & D) ^ M1 ^ T24 ^ M15;
145
M22 = T26 ^ M6 ^ (T22 & T9) ^ M13;
146
M23 = (T20 & T17) ^ M6 ^ M15 ^ T25;
148
M37 = M21 ^ ((M20 ^ M21) & (M23 ^ M25));
149
M38 = M20 ^ M25 ^ (M21 | (M20 & M23));
150
M39 = M23 ^ ((M22 ^ M23) & (M21 ^ M25));
151
M40 = M22 ^ M25 ^ (M23 | (M21 & M22));
177
/* Undo linear preprocessing */
178
uint16_t P0 = M52 ^ M61;
179
uint16_t P1 = M58 ^ M59;
180
uint16_t P2 = M54 ^ M62;
181
uint16_t P3 = M47 ^ M50;
182
uint16_t P4 = M48 ^ M56;
183
uint16_t P5 = M46 ^ M51;
184
uint16_t P6 = M49 ^ M60;
185
uint16_t P7 = P0 ^ P1;
186
uint16_t P8 = M50 ^ M53;
187
uint16_t P9 = M55 ^ M63;
188
uint16_t P10 = M57 ^ P4;
189
uint16_t P11 = P0 ^ P3;
190
uint16_t P12 = M46 ^ M48;
191
uint16_t P13 = M49 ^ M51;
192
uint16_t P14 = M49 ^ M62;
193
uint16_t P15 = M54 ^ M59;
194
uint16_t P16 = M57 ^ M61;
195
uint16_t P17 = M58 ^ P2;
196
uint16_t P18 = M63 ^ P5;
197
uint16_t P19 = P2 ^ P3;
198
uint16_t P20 = P4 ^ P6;
199
uint16_t P22 = P2 ^ P7;
200
uint16_t P23 = P7 ^ P8;
201
uint16_t P24 = P5 ^ P7;
202
uint16_t P25 = P6 ^ P10;
203
uint16_t P26 = P9 ^ P11;
204
uint16_t P27 = P10 ^ P18;
205
uint16_t P28 = P11 ^ P25;
206
uint16_t P29 = P15 ^ P20;
207
s->slice[7] = P13 ^ P22;
208
s->slice[6] = P26 ^ P29;
209
s->slice[5] = P17 ^ P28;
210
s->slice[4] = P12 ^ P22;
211
s->slice[3] = P23 ^ P27;
212
s->slice[2] = P19 ^ P24;
213
s->slice[1] = P14 ^ P23;
214
s->slice[0] = P9 ^ P16;
216
/* Linear postprocessing */
217
uint16_t L0 = M61 ^ M62;
218
uint16_t L1 = M50 ^ M56;
219
uint16_t L2 = M46 ^ M48;
220
uint16_t L3 = M47 ^ M55;
221
uint16_t L4 = M54 ^ M58;
222
uint16_t L5 = M49 ^ M61;
223
uint16_t L6 = M62 ^ L5;
224
uint16_t L7 = M46 ^ L3;
225
uint16_t L8 = M51 ^ M59;
226
uint16_t L9 = M52 ^ M53;
227
uint16_t L10 = M53 ^ L4;
228
uint16_t L11 = M60 ^ L2;
229
uint16_t L12 = M48 ^ M51;
230
uint16_t L13 = M50 ^ L0;
231
uint16_t L14 = M52 ^ M61;
232
uint16_t L15 = M55 ^ L1;
233
uint16_t L16 = M56 ^ L0;
234
uint16_t L17 = M57 ^ L1;
235
uint16_t L18 = M58 ^ L8;
236
uint16_t L19 = M63 ^ L4;
237
uint16_t L20 = L0 ^ L1;
238
uint16_t L21 = L1 ^ L7;
239
uint16_t L22 = L3 ^ L12;
240
uint16_t L23 = L18 ^ L2;
241
uint16_t L24 = L15 ^ L9;
242
uint16_t L25 = L6 ^ L10;
243
uint16_t L26 = L7 ^ L9;
244
uint16_t L27 = L8 ^ L10;
245
uint16_t L28 = L11 ^ L14;
246
uint16_t L29 = L11 ^ L17;
247
s->slice[7] = L6 ^ L24;
248
s->slice[6] = ~(L16 ^ L26);
249
s->slice[5] = ~(L19 ^ L28);
250
s->slice[4] = L6 ^ L21;
251
s->slice[3] = L20 ^ L22;
252
s->slice[2] = L25 ^ L29;
253
s->slice[1] = ~(L13 ^ L27);
254
s->slice[0] = ~(L6 ^ L23);
258
#define BIT_RANGE(from,to) (((1 << ((to) - (from))) - 1) << (from))
260
#define BIT_RANGE_LEFT(x,from,to,shift) (((x) & BIT_RANGE((from), (to))) << (shift))
261
#define BIT_RANGE_RIGHT(x,from,to,shift) (((x) & BIT_RANGE((from), (to))) >> (shift))
263
static void ShiftRows(AES_state* s) {
265
for (i = 0; i < 8; i++) {
266
uint16_t v = s->slice[i];
268
(v & BIT_RANGE(0, 4)) |
269
BIT_RANGE_LEFT(v, 4, 5, 3) | BIT_RANGE_RIGHT(v, 5, 8, 1) |
270
BIT_RANGE_LEFT(v, 8, 10, 2) | BIT_RANGE_RIGHT(v, 10, 12, 2) |
271
BIT_RANGE_LEFT(v, 12, 15, 1) | BIT_RANGE_RIGHT(v, 15, 16, 3);
275
static void InvShiftRows(AES_state* s) {
277
for (i = 0; i < 8; i++) {
278
uint16_t v = s->slice[i];
280
(v & BIT_RANGE(0, 4)) |
281
BIT_RANGE_LEFT(v, 4, 7, 1) | BIT_RANGE_RIGHT(v, 7, 8, 3) |
282
BIT_RANGE_LEFT(v, 8, 10, 2) | BIT_RANGE_RIGHT(v, 10, 12, 2) |
283
BIT_RANGE_LEFT(v, 12, 13, 3) | BIT_RANGE_RIGHT(v, 13, 16, 1);
287
#define ROT(x,b) (((x) >> ((b) * 4)) | ((x) << ((4-(b)) * 4)))
289
static void MixColumns(AES_state* s, int inv) {
290
/* The MixColumns transform treats the bytes of the columns of the state as
291
* coefficients of a 3rd degree polynomial over GF(2^8) and multiplies them
292
* by the fixed polynomial a(x) = {03}x^3 + {01}x^2 + {01}x + {02}, modulo
295
* In the inverse transform, we multiply by the inverse of a(x),
296
* a^-1(x) = {0b}x^3 + {0d}x^2 + {09}x + {0e}. This is equal to
297
* a(x) * ({04}x^2 + {05}), so we can reuse the forward transform's code
298
* (found in OpenSSL's bsaes-x86_64.pl, attributed to Jussi Kivilinna)
300
* In the bitsliced representation, a multiplication of every column by x
301
* mod x^4 + 1 is simply a right rotation.
304
/* Shared for both directions is a multiplication by a(x), which can be
305
* rewritten as (x^3 + x^2 + x) + {02}*(x^3 + {01}).
307
* First compute s into the s? variables, (x^3 + {01}) * s into the s?_01
308
* variables and (x^3 + x^2 + x)*s into the s?_123 variables.
310
uint16_t s0 = s->slice[0], s1 = s->slice[1], s2 = s->slice[2], s3 = s->slice[3];
311
uint16_t s4 = s->slice[4], s5 = s->slice[5], s6 = s->slice[6], s7 = s->slice[7];
312
uint16_t s0_01 = s0 ^ ROT(s0, 1), s0_123 = ROT(s0_01, 1) ^ ROT(s0, 3);
313
uint16_t s1_01 = s1 ^ ROT(s1, 1), s1_123 = ROT(s1_01, 1) ^ ROT(s1, 3);
314
uint16_t s2_01 = s2 ^ ROT(s2, 1), s2_123 = ROT(s2_01, 1) ^ ROT(s2, 3);
315
uint16_t s3_01 = s3 ^ ROT(s3, 1), s3_123 = ROT(s3_01, 1) ^ ROT(s3, 3);
316
uint16_t s4_01 = s4 ^ ROT(s4, 1), s4_123 = ROT(s4_01, 1) ^ ROT(s4, 3);
317
uint16_t s5_01 = s5 ^ ROT(s5, 1), s5_123 = ROT(s5_01, 1) ^ ROT(s5, 3);
318
uint16_t s6_01 = s6 ^ ROT(s6, 1), s6_123 = ROT(s6_01, 1) ^ ROT(s6, 3);
319
uint16_t s7_01 = s7 ^ ROT(s7, 1), s7_123 = ROT(s7_01, 1) ^ ROT(s7, 3);
320
/* Now compute s = s?_123 + {02} * s?_01. */
321
s->slice[0] = s7_01 ^ s0_123;
322
s->slice[1] = s7_01 ^ s0_01 ^ s1_123;
323
s->slice[2] = s1_01 ^ s2_123;
324
s->slice[3] = s7_01 ^ s2_01 ^ s3_123;
325
s->slice[4] = s7_01 ^ s3_01 ^ s4_123;
326
s->slice[5] = s4_01 ^ s5_123;
327
s->slice[6] = s5_01 ^ s6_123;
328
s->slice[7] = s6_01 ^ s7_123;
330
/* In the reverse direction, we further need to multiply by
331
* {04}x^2 + {05}, which can be written as {04} * (x^2 + {01}) + {01}.
333
* First compute (x^2 + {01}) * s into the t?_02 variables: */
334
uint16_t t0_02 = s->slice[0] ^ ROT(s->slice[0], 2);
335
uint16_t t1_02 = s->slice[1] ^ ROT(s->slice[1], 2);
336
uint16_t t2_02 = s->slice[2] ^ ROT(s->slice[2], 2);
337
uint16_t t3_02 = s->slice[3] ^ ROT(s->slice[3], 2);
338
uint16_t t4_02 = s->slice[4] ^ ROT(s->slice[4], 2);
339
uint16_t t5_02 = s->slice[5] ^ ROT(s->slice[5], 2);
340
uint16_t t6_02 = s->slice[6] ^ ROT(s->slice[6], 2);
341
uint16_t t7_02 = s->slice[7] ^ ROT(s->slice[7], 2);
342
/* And then update s += {04} * t?_02 */
343
s->slice[0] ^= t6_02;
344
s->slice[1] ^= t6_02 ^ t7_02;
345
s->slice[2] ^= t0_02 ^ t7_02;
346
s->slice[3] ^= t1_02 ^ t6_02;
347
s->slice[4] ^= t2_02 ^ t6_02 ^ t7_02;
348
s->slice[5] ^= t3_02 ^ t7_02;
349
s->slice[6] ^= t4_02;
350
s->slice[7] ^= t5_02;
354
static void AddRoundKey(AES_state* s, const AES_state* round) {
356
for (b = 0; b < 8; b++) {
357
s->slice[b] ^= round->slice[b];
361
/** column_0(s) = column_c(a) */
362
static void GetOneColumn(AES_state* s, const AES_state* a, int c) {
364
for (b = 0; b < 8; b++) {
365
s->slice[b] = (a->slice[b] >> c) & 0x1111;
369
/** column_c1(r) |= (column_0(s) ^= column_c2(a)) */
370
static void KeySetupColumnMix(AES_state* s, AES_state* r, const AES_state* a, int c1, int c2) {
372
for (b = 0; b < 8; b++) {
373
r->slice[b] |= ((s->slice[b] ^= ((a->slice[b] >> c2) & 0x1111)) & 0x1111) << c1;
377
/** Rotate the rows in s one position upwards, and xor in r */
378
static void KeySetupTransform(AES_state* s, const AES_state* r) {
380
for (b = 0; b < 8; b++) {
381
s->slice[b] = ((s->slice[b] >> 4) | (s->slice[b] << 12)) ^ r->slice[b];
385
/* Multiply the cells in s by x, as polynomials over GF(2) mod x^8 + x^4 + x^3 + x + 1 */
386
static void MultX(AES_state* s) {
387
uint16_t top = s->slice[7];
388
s->slice[7] = s->slice[6];
389
s->slice[6] = s->slice[5];
390
s->slice[5] = s->slice[4];
391
s->slice[4] = s->slice[3] ^ top;
392
s->slice[3] = s->slice[2] ^ top;
393
s->slice[2] = s->slice[1];
394
s->slice[1] = s->slice[0] ^ top;
398
/** Expand the cipher key into the key schedule.
400
* state must be a pointer to an array of size nrounds + 1.
401
* key must be a pointer to 4 * nkeywords bytes.
403
* AES128 uses nkeywords = 4, nrounds = 10
404
* AES192 uses nkeywords = 6, nrounds = 12
405
* AES256 uses nkeywords = 8, nrounds = 14
407
static void AES_setup(AES_state* rounds, const uint8_t* key, int nkeywords, int nrounds)
411
/* The one-byte round constant */
412
AES_state rcon = {{1,0,0,0,0,0,0,0}};
413
/* The number of the word being generated, modulo nkeywords */
415
/* The column representing the word currently being processed */
418
for (i = 0; i < nrounds + 1; i++) {
420
for (b = 0; b < 8; b++) {
421
rounds[i].slice[b] = 0;
425
/* The first nkeywords round columns are just taken from the key directly. */
426
for (i = 0; i < nkeywords; i++) {
428
for (r = 0; r < 4; r++) {
429
LoadByte(&rounds[i >> 2], *(key++), r, i & 3);
433
GetOneColumn(&column, &rounds[(nkeywords - 1) >> 2], (nkeywords - 1) & 3);
435
for (i = nkeywords; i < 4 * (nrounds + 1); i++) {
436
/* Transform column */
438
SubBytes(&column, 0);
439
KeySetupTransform(&column, &rcon);
441
} else if (nkeywords > 6 && pos == 4) {
442
SubBytes(&column, 0);
444
if (++pos == nkeywords) pos = 0;
445
KeySetupColumnMix(&column, &rounds[i >> 2], &rounds[(i - nkeywords) >> 2], i & 3, (i - nkeywords) & 3);
449
static void AES_encrypt(const AES_state* rounds, int nrounds, unsigned char* cipher16, const unsigned char* plain16) {
453
LoadBytes(&s, plain16);
454
AddRoundKey(&s, rounds++);
456
for (round = 1; round < nrounds; round++) {
460
AddRoundKey(&s, rounds++);
465
AddRoundKey(&s, rounds);
467
SaveBytes(cipher16, &s);
470
static void AES_decrypt(const AES_state* rounds, int nrounds, unsigned char* plain16, const unsigned char* cipher16) {
471
/* Most AES decryption implementations use the alternate scheme
472
* (the Equivalent Inverse Cipher), which looks more like encryption, but
473
* needs different round constants. We can't reuse any code here anyway, so
480
LoadBytes(&s, cipher16);
481
AddRoundKey(&s, rounds--);
483
for (round = 1; round < nrounds; round++) {
486
AddRoundKey(&s, rounds--);
492
AddRoundKey(&s, rounds);
494
SaveBytes(plain16, &s);
497
void AES128_init(AES128_ctx* ctx, const unsigned char* key16) {
498
AES_setup(ctx->rk, key16, 4, 10);
501
void AES128_encrypt(const AES128_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16) {
503
AES_encrypt(ctx->rk, 10, cipher16, plain16);
509
void AES128_decrypt(const AES128_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16) {
511
AES_decrypt(ctx->rk, 10, plain16, cipher16);
517
void AES192_init(AES192_ctx* ctx, const unsigned char* key24) {
518
AES_setup(ctx->rk, key24, 6, 12);
521
void AES192_encrypt(const AES192_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16) {
523
AES_encrypt(ctx->rk, 12, cipher16, plain16);
530
void AES192_decrypt(const AES192_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16) {
532
AES_decrypt(ctx->rk, 12, plain16, cipher16);
538
void AES256_init(AES256_ctx* ctx, const unsigned char* key32) {
539
AES_setup(ctx->rk, key32, 8, 14);
542
void AES256_encrypt(const AES256_ctx* ctx, size_t blocks, unsigned char* cipher16, const unsigned char* plain16) {
544
AES_encrypt(ctx->rk, 14, cipher16, plain16);
550
void AES256_decrypt(const AES256_ctx* ctx, size_t blocks, unsigned char* plain16, const unsigned char* cipher16) {
552
AES_decrypt(ctx->rk, 14, plain16, cipher16);