1
C nettle, low-level cryptographic library
3
C Copyright (C) 2011 Niels Möller
5
C The nettle library is free software; you can redistribute it and/or modify
6
C it under the terms of the GNU Lesser General Public License as published by
7
C the Free Software Foundation; either version 2.1 of the License, or (at your
8
C option) any later version.
10
C The nettle library is distributed in the hope that it will be useful, but
11
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
13
C License for more details.
15
C You should have received a copy of the GNU Lesser General Public License
16
C along with the nettle library; see the file COPYING.LIB. If not, write to
17
C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
20
include_src(<x86_64/serpent.m4>)
24
C Single block serpent state, two copies
35
C Quadruple block serpent state, two copies
46
C All-ones constant register: it is filled with pcmpeqd MINUS1, MINUS1
C in nettle_serpent_encrypt before the four-block code runs.
define(<MINUS1>, <%xmm8>)
48
C T1-T3 name xmm10-xmm12. Presumably scratch registers for the
C four-block macros -- TODO confirm against their uses.
define(<T1>, <%xmm10>)
49
define(<T2>, <%xmm11>)
50
define(<T3>, <%xmm12>)
59
C 32-bit view of r14. NOTE(review): r14 is callee-saved in both the
C System V and Microsoft x64 conventions, so the function using TMP32
C has to preserve it.
define(<TMP32>, <%r14d>)
61
C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8
C In the per-instruction comments, x0-x3 name the inputs $1-$4 and
C y0-y3 name the outputs $5-$8. A comment of the form y = a op b on a
C mov line describes the value being built; the mov itself only copies
C a into y.
64
mov $2, $8 C y3 = x1 ^ x2
66
mov $1, $5 C y0 = x0 | x3
68
mov $1, $6 C y1 = x0 ^ x1
71
mov $3, $7 C y2 = x2 | y3
77
mov $6, $5 C y0 = y1 & x2
85
mov $5, $6 C y1 = y0 ^ x1
91
mov $1, $6 C y1 = x0 | x3
93
mov $3, $7 C y2 = x2 ^ x3
97
mov $1, $8 C y3 = x0 ^ x2
100
and $4, $8 C y3 &= x3
101
mov $6, $1 C x0 = y1 & y2
104
xor $5, $7 C y2 ^= y0
105
xor $1, $8 C y3 ^= x0
106
mov $6, $1 C x0 = y1 ^ y3
108
xor $7, $1 C x0 ^= y2
109
mov $2, $6 C y1 = x1 & x3
111
xor $1, $6 C y1 ^= x0
112
mov $6, $4 C x3 = y1 | y3
115
and $4, $5 C y0 &= x3
116
xor $3, $5 C y0 ^= x2
120
mov $1, $7 C y2 = x0 | x2
300
C Parallel operation on four blocks at a time.
302
C pnot instruction is missing. Complement by XORing with MINUS1, a
303
C register kept at all ones (set up with pcmpeqd MINUS1, MINUS1).
310
C SSE2 forms of the S-box steps (movdqa/pxor/pand/por). The comments
C use the same naming as the scalar macros: x0-x3 are $1-$4 and y0-y3
C are $5-$8. A y = a op b comment on a movdqa line describes the value
C being built; the movdqa itself only copies a into y.
movdqa $2, $8 C y3 = x1 ^ x2
312
movdqa $1, $5 C y0 = x0 | x3
314
movdqa $1, $6 C y1 = x0 ^ x1
316
pxor $5, $8 C y3 ^= y0
317
movdqa $3, $7 C y2 = x2 | y3
319
pxor $4, $1 C x0 ^= x3
320
pand $4, $7 C y2 &= x3
321
pxor $3, $4 C x3 ^= x2
322
por $2, $3 C x2 |= x1
323
movdqa $6, $5 C y0 = y1 & x2
325
pxor $5, $7 C y2 ^= y0
326
pand $7, $5 C y0 &= y2
327
pxor $3, $5 C y0 ^= x2
328
pand $1, $2 C x1 &= x0
329
pxor $1, $5 C y0 ^= x0
331
movdqa $5, $6 C y1 = y0 ^ x1
333
pxor $4, $6 C y1 ^= x3
337
movdqa $1, $6 C y1 = x0 | x3
339
movdqa $3, $7 C y2 = x2 ^ x3
341
movdqa $2, $5 C y0 = ~x1
343
movdqa $1, $8 C y3 = x0 ^ x2
345
por $1, $5 C y0 |= x0
346
pand $4, $8 C y3 &= x3
347
movdqa $6, $1 C x0 = y1 & y2
349
por $2, $8 C y3 |= x1
350
pxor $5, $7 C y2 ^= y0
351
pxor $1, $8 C y3 ^= x0
352
movdqa $6, $1 C x0 = y1 ^ y3
354
pxor $7, $1 C x0 ^= y2
355
movdqa $2, $6 C y1 = x1 & x3
357
pxor $1, $6 C y1 ^= x0
358
movdqa $6, $4 C x3 = y1 | y3
361
pand $4, $5 C y0 &= x3
362
pxor $3, $5 C y0 ^= x2
366
movdqa $1, $7 C y2 = x0 | x2
502
C pandn complements its destination ($6) before ANDing it with the
C source ($1), so the negation needs no separate instruction;
C presumably that is what "implicit" refers to.
pandn $1, $6 C t02 implicit
524
C WLT(x0, x1, x2, x3)
546
.file "serpent-encrypt.asm"
548
C serpent_encrypt(struct serpent_context *ctx,
549
C unsigned length, uint8_t *dst,
550
C const uint8_t *src)
553
PROLOGUE(nettle_serpent_encrypt)
554
C save all registers that need to be saved
567
C Point at the final subkey.
573
C Comparing a register with itself for equality sets every bit, so
C this leaves MINUS1 holding the all-ones constant.
pcmpeqd MINUS1, MINUS1
577
C Unaligned 16-byte loads from SRC + N at offsets 16, 32 and 48.
movups 16(SRC, N), X1
578
movups 32(SRC, N), X2
579
movups 48(SRC, N), X3
581
WTRANSPOSE(X0, X1, X2, X3)
590
C Rounds alternate between the X and Y register sets: each WSBOXn
C reads one set and writes the other. The first WKEYXOR argument
C advances by 16 per round; an empty argument presumably means
C offset 0 -- TODO confirm against the WKEYXOR definition.
WKEYXOR(, X0,X1,X2,X3)
591
WSBOX0(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
594
WKEYXOR(16, Y0,Y1,Y2,Y3)
595
WSBOX1(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
598
WKEYXOR(32, X0,X1,X2,X3)
599
WSBOX2(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
602
WKEYXOR(48, Y0,Y1,Y2,Y3)
603
WSBOX3(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
606
WKEYXOR(64, X0,X1,X2,X3)
607
WSBOX4(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
610
WKEYXOR(80, Y0,Y1,Y2,Y3)
611
WSBOX5(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
614
WKEYXOR(96, X0,X1,X2,X3)
615
WSBOX6(X0,X1,X2,X3, Y0,Y1,Y2,Y3)
618
WKEYXOR(112, Y0,Y1,Y2,Y3)
619
WSBOX7(Y0,Y1,Y2,Y3, X0,X1,X2,X3)
623
C FIXME: CNT known to be zero, no index register needed
624
WKEYXOR(, X0,X1,X2,X3)
626
WTRANSPOSE(X0,X1,X2,X3)
629
C Unaligned 16-byte stores to DST + N at offsets 16, 32 and 48.
movups X1, 16(DST, N)
630
movups X2, 32(DST, N)
631
movups X3, 48(DST, N)
633
C FIXME: Adjust N, so we can use just jnc without an extra cmp.
640
C The single-block loop here is slightly slower than the double-block
641
C loop in serpent-encrypt.c.
643
C FIXME: Should use non-sse2 code only if we have a single block left.
644
C With two or three blocks, it should be better to do them in parallel.
664
C Scalar rounds. Each SBOXn call reads one register set and writes the
C other, so the x and y sets swap roles from one round to the next.
SBOX0(x0,x1,x2,x3, y0,y1,y2,y3)
671
SBOX1(y0,y1,y2,y3, x0,x1,x2,x3)
678
SBOX2(x0,x1,x2,x3, y0,y1,y2,y3)
685
SBOX3(y0,y1,y2,y3, x0,x1,x2,x3)
692
SBOX4(x0,x1,x2,x3, y0,y1,y2,y3)
699
SBOX5(y0,y1,y2,y3, x0,x1,x2,x3)
703
C XOR memory at CTX + CNT + offset into the state registers; the
C offsets advance by 4 bytes from one register to the next.
xor 100(CTX, CNT), x1
704
xor 104(CTX, CNT), x2
705
xor 108(CTX, CNT), x3
706
SBOX6(x0,x1,x2,x3, y0,y1,y2,y3)
709
C Same pattern for the y set, offsets 112 through 124.
xor 112(CTX, CNT), y0
710
xor 116(CTX, CNT), y1
711
xor 120(CTX, CNT), y2
712
xor 124(CTX, CNT), y3
713
SBOX7(y0,y1,y2,y3, x0,x1,x2,x3)
717
C Apply final subkey.