1
/*
 * Everything after this guard is assembled only when ENABLE_SHA1_HWACCEL
 * is nonzero and the compiler defines both __GNUC__ and __x86_64__.
 * NOTE(review): the matching #endif is not visible in this excerpt.
 * NOTE(review): the bare decimal numbers standing alone on lines of this
 * excerpt look like leftover line numbers from an extraction step; they
 * are not assembler statements - confirm against the real file.
 */
#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2
/* The code is adapted from Linux kernel's source */
4
// We use shorter insns, even though they are for "wrong"
5
// data type (fp, not int).
6
// For Intel, there is no penalty for doing it at all
7
// (CPUs which do have such penalty do not support SHA insns).
8
// For AMD, the penalty is one extra cycle
9
// (allegedly: I failed to find measurable difference).
11
/*
 * Instruction aliases. The code spells its 128-bit moves, its 32-bit-lane
 * shuffle and its 32-bit extract through these names, so the encoding can
 * be switched in one place. The commented-out line beside each alias is
 * the alternative that was considered.
 */
/* mova128: aligned 128-bit move (memory operand must be 16-byte aligned). */
//#define mova128 movdqa
12
#define mova128 movaps
13
/* movu128: unaligned 128-bit move. */
//#define movu128 movdqu
14
#define movu128 movups
17
/*
 * shuf128_32: shufps takes its two low result lanes from the destination
 * and its two high result lanes from the source, so it behaves like pshufd
 * only when both register operands are the same register - which is how
 * the uses visible in this excerpt call it.
 */
//#define shuf128_32 pshufd
18
#define shuf128_32 shufps
20
/* extr128_32: extract one 32-bit lane to a register or memory operand. */
#define extr128_32 pextrd
21
//#define extr128_32 extractps # not shorter
23
// pshufb is an SSSE3 insn.
24
// pinsrd, pextrd, extractps are SSE4.1 insns.
25
// We do not check SSSE3/SSE4.1 in cpuid,
26
// all SHA-capable CPUs support them as well.
29
/*
 * Empty .note.GNU-stack section: the GNU toolchain convention for marking
 * an object file as not requiring an executable stack.
 */
.section .note.GNU-stack, "", @progbits
31
/*
 * sha1_process_block64_shaNI
 *
 * What the visible code does:
 *   - reads 16 bytes of hash state from 80(%rdi) into ABCD and one more
 *     32-bit word from 80+4*4(%rdi) into the top lane of E0;
 *   - reads the 64 bytes at 0(%rdi) into MSG0..MSG3;
 *   - issues sha1rnds4 twenty times (4 rounds each, 80 rounds total),
 *     alternating E0 and E1 as the second operand;
 *   - writes ABCD back to 80(%rdi) and the top lane of E0 to
 *     80+4*4(%rdi).
 *
 * In:    %rdi = base address for all of the loads/stores above
 *        (presumably the first pointer argument under the SysV AMD64
 *        ABI - the C prototype and struct layout are not visible here).
 * Clobb: %xmm1 (E0), %xmm7, and whichever registers ABCD, E1 and
 *        MSG0..MSG3 expand to.
 *
 * NOTE(review): this excerpt is evidently incomplete. ABCD, E1 and
 * MSG0..MSG3 are used but their #defines are not visible; there is no
 * `ret`; nothing uses the mask loaded into %xmm7; and the comments
 * "Save hash values ..." and "Add current hash values ..." are not
 * followed by any instruction. The bare decimal numbers on their own
 * lines look like extraction residue, and the gaps in that numbering
 * suggest dropped lines. Confirm against the full file before changing
 * any instruction here.
 */
/* Per-function text section: allocatable ("a") and executable ("x"). */
.section .text.sha1_process_block64_shaNI, "ax", @progbits
32
/* Visible to the linker, but with hidden ELF visibility. */
.globl sha1_process_block64_shaNI
33
.hidden sha1_process_block64_shaNI
34
.type sha1_process_block64_shaNI, @function
37
#define E0 %xmm1 /* Need two E's b/c they ping pong */
44
.balign 8 # allow decoders to fetch at least 2 first insns
45
sha1_process_block64_shaNI:
46
/* load initial hash values */
47
movu128 80(%rdi), ABCD
49
/*
 * NOTE(review): pinsrd writes only lane 3; the other three lanes of E0
 * keep their previous contents unless an instruction not visible in
 * this excerpt clears them first - confirm.
 */
pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word
50
/* 0x1B with both operands equal reverses the four 32-bit lanes. */
shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
52
/*
 * mova128 is an aligned load: PSHUFFLE_BYTE_FLIP_MASK must sit on a
 * 16-byte boundary or this instruction faults.
 */
mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
54
/* Load the 64 bytes at 0(%rdi), 16 bytes per register (unaligned OK). */
movu128 0*16(%rdi), MSG0
56
movu128 1*16(%rdi), MSG1
58
movu128 2*16(%rdi), MSG2
60
movu128 3*16(%rdi), MSG3
63
/* Save hash values for addition after rounds */
70
/*
 * Each sha1rnds4 performs four SHA-1 rounds on ABCD; its immediate
 * selects the round function and constant. Five instructions per
 * immediate value give 20 rounds per value.
 */
/* Rounds 0-19: immediate 0. */
sha1rnds4 $0, E0, ABCD
75
sha1rnds4 $0, E1, ABCD
81
sha1rnds4 $0, E0, ABCD
89
sha1rnds4 $0, E1, ABCD
97
sha1rnds4 $0, E0, ABCD
105
/* Rounds 20-39: immediate 1. */
sha1rnds4 $1, E1, ABCD
113
sha1rnds4 $1, E0, ABCD
121
sha1rnds4 $1, E1, ABCD
129
sha1rnds4 $1, E0, ABCD
137
sha1rnds4 $1, E1, ABCD
145
/* Rounds 40-59: immediate 2. */
sha1rnds4 $2, E0, ABCD
153
sha1rnds4 $2, E1, ABCD
161
sha1rnds4 $2, E0, ABCD
169
sha1rnds4 $2, E1, ABCD
177
sha1rnds4 $2, E0, ABCD
185
/* Rounds 60-79: immediate 3. */
sha1rnds4 $3, E1, ABCD
193
sha1rnds4 $3, E0, ABCD
201
sha1rnds4 $3, E1, ABCD
208
sha1rnds4 $3, E0, ABCD
213
sha1rnds4 $3, E1, ABCD
215
/* Add current hash values with previously saved */
219
/* Write hash values back in the correct order */
220
/* Undo the lane reversal done after the initial load. */
shuf128_32 $0x1B, ABCD, ABCD
221
movu128 ABCD, 80(%rdi)
222
/* Store lane 3 of E0 to the slot it was loaded from. */
extr128_32 $3, E0, 80+4*4(%rdi)
225
/* Record the symbol size: current location minus the entry label. */
.size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
227
/*
 * PSHUFFLE_BYTE_FLIP_MASK: 16-byte constant in a read-only, mergeable
 * section ("aM", entry size 16).
 *
 * .octa emits the value as a little-endian 128-bit integer, so the bytes
 * in memory are 0x0f, 0x0e, ..., 0x01, 0x00. Used as a pshufb control,
 * that selects source byte 15-i for destination byte i, i.e. it reverses
 * all 16 bytes of a register. (The pshufb uses themselves are not
 * visible in this excerpt.)
 *
 * NOTE(review): the mask is loaded with mova128 (movaps), which needs a
 * 16-byte-aligned address. No alignment directive is visible in this
 * excerpt - confirm that one (e.g. `.balign 16`) precedes the label.
 */
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
229
PSHUFFLE_BYTE_FLIP_MASK:
230
.octa 0x000102030405060708090a0b0c0d0e0f