C nettle, low-level cryptographic library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
.file "sha512-compress.asm"

define(<STATE>, <r0>)
define(<INPUT>, <r1>)
define(<K>, <r2>)
define(<COUNT>, <r3>)
define(<SHIFT>, <r12>)

define(<SA>, <d0>)
define(<SB>, <d1>)
define(<SC>, <d2>)
define(<SD>, <d3>)
define(<SE>, <d4>)
define(<SF>, <d5>)
define(<SG>, <d6>)
define(<SH>, <d7>)
define(<QSAB>, <q0>)
define(<QSCD>, <q1>)
define(<QSEF>, <q2>)
define(<QSGH>, <q3>)

C d8-d15 are callee-save
define(<DT0>, <d8>)
define(<DT1>, <d9>)
define(<DT2>, <d10>)
define(<DT3>, <d11>)
define(<DT4>, <d12>)
define(<DT5>, <d13>)
define(<QT01>, <q4>)
define(<QT23>, <q5>)
define(<QT45>, <q6>)

C Used only when reading the input, can overlap with state
define(<DT6>, <d0>)
define(<DT7>, <d1>)
define(<QT67>, <q0>)

define(<DW0>, <d16>)
define(<DW1>, <d17>)
define(<DW2>, <d18>)
define(<DW3>, <d19>)
define(<DW4>, <d20>)
define(<DW5>, <d21>)
define(<DW6>, <d22>)
define(<DW7>, <d23>)
define(<DW8>, <d24>)
define(<DW9>, <d25>)
define(<DW10>, <d26>)
define(<DW11>, <d27>)
define(<DW12>, <d28>)
define(<DW13>, <d29>)
define(<DW14>, <d30>)
define(<DW15>, <d31>)
define(<QW0001>, <q8>)
define(<QW0203>, <q9>)
define(<QW0405>, <q10>)
define(<QW0607>, <q11>)
define(<QW0809>, <q12>)
define(<QW1011>, <q13>)
define(<QW1213>, <q14>)
define(<QW1415>, <q15>)

define(<EXPAND_ME>, <$1>)
define(<W>, <EXPAND_ME(<DW>eval(($1) % 16))>)
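
C The sixteen DW registers form a ring buffer for the message
C schedule; W(i) picks the one holding W[i]. For example, W(17)
C expands to DW1. A C sketch of the same indexing (illustrative
C only, not part of this file):
C
C   uint64_t w[16];            /* w[i % 16] holds W[i] */
C   #define W(i) (w[(i) % 16])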

C The message expansion:
C   W[i] = S1(W[i-2]) + W[i-7] + S0(W[i-15]) + W[i-16]
C where here S1(x) = x>>>19 ^ x>>>61 ^ x>>6 and
C S0(x) = x>>>1 ^ x>>>8 ^ x>>7 (>>> is rotate right).
C
C If x = W(i+14), y = W(i+1), we do the xors of the shifted
C values in parallel:
C
C   shift   x<<45   y<<63
C   xor     x>>19   y>>1
C   shift   x<<3    y<<56
C   xor     x>>61   y>>8
C   shift   x>>6    y>>7
C -----------------------------
vshl.i64 DT0, W($1+14), #45
vshl.i64 DT1, W($1+1), #63
vshr.u64 DT2, W($1+14), #19
vshr.u64 DT3, W($1+1), #1
vshl.i64 DT4, W($1+14), #3
vshl.i64 DT5, W($1+1), #56
veor.i64 QT01, QT01, QT23
vshr.u64 DT2, W($1+14), #61
vshr.u64 DT3, W($1+1), #8
veor.i64 QT01, QT01, QT45
vshr.u64 DT4, W($1+14), #6
vshr.u64 DT5, W($1+1), #7
veor.i64 QT01, QT01, QT23
vadd.i64 W($1), W($1), W($1+9)
veor.i64 QT01, QT01, QT45
vadd.i64 W($1), W($1), DT0
vadd.i64 W($1), W($1), DT1
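
C A C sketch of this step under the ring-buffer indexing above
C (illustrative only; s0/s1 as in the comment before the macro):
C
C   /* w[i%16] still holds W[i-16], so += supplies that term */
C   w[i%16] += s1(w[(i+14)%16]) + w[(i+9)%16] + s0(w[(i+1)%16]);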

C ROUND(A,B,C,D,E,F,G,H,i)
C
C H += S1(E) + Choice(E,F,G) + K + W
C D += H
C H += S0(A) + Majority(A,B,C)
C
C Where
C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
C Choice (E, F, G) = G^(E&(F^G))
C Majority (A,B,C) = (A&B) + (C&(A^B))

C Do S1 and S0 in parallel:
C
C   shift   e<<50   a<<36
C   xor     e>>14   a>>28
C   shift   e<<46   a<<30
C   xor     e>>18   a>>34
C   shift   e<<23   a<<25
C   xor     e>>41   a>>39
C ----------------------------
vshl.i64 DT0, $5, #50
vshl.i64 DT1, $1, #36
vshr.u64 DT2, $5, #14
vshr.u64 DT3, $1, #28
vshl.i64 DT4, $5, #46
vshl.i64 DT5, $1, #30
veor QT01, QT01, QT23
vshr.u64 DT2, $5, #18
vshr.u64 DT3, $1, #34
veor QT01, QT01, QT45
vshl.i64 DT4, $5, #23
vshl.i64 DT5, $1, #25
veor QT01, QT01, QT23
vshr.u64 DT2, $5, #41
vshr.u64 DT3, $1, #39
veor QT01, QT01, QT45
veor QT01, QT01, QT23
vldr DT3, [K,#eval(8*$9)]
vadd.i64 $8, $8, W($9)
vadd.i64 QT01, QT01, QT45
vadd.i64 DT1, DT1, DT2
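
C A C sketch of one round (illustrative only; the rotations are
C the S1/S0 above written as right rotations):
C
C   t = h + S1(e) + (g ^ (e & (f ^ g))) + k[i] + w[i%16];
C   d += t;
C   h = t + S0(a) + ((a & b) | (c & (a ^ b)));
C
C with S1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41) and
C S0(a) = ROTR(a,28) ^ ROTR(a,34) ^ ROTR(a,39).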

C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)

PROLOGUE(_nettle_sha512_compress)
vpush {d8,d9,d10,d11,d12,d13}

C INPUT may be unaligned; align it down to an 8-byte boundary
C and compensate by shifting the loaded words.
ands SHIFT, INPUT, #7
and INPUT, INPUT, #-8
vld1.8 {DT5}, [INPUT :64]
addne INPUT, INPUT, #8
addeq SHIFT, SHIFT, #8
C Convert byte count to bits
lsl SHIFT, SHIFT, #3

C Put right shift in DT0 and DT1, aka QT01; NEON vshl by a
C negative count shifts right.
neg SHIFT, SHIFT
vmov.i32 DT0, #0
vmov.32 DT0[0], SHIFT
vmov DT1, DT0
C Put left shift in DT2 and DT3, aka QT23
add SHIFT, SHIFT, #64
vmov.i32 DT2, #0
vmov.32 DT2[0], SHIFT
vmov DT3, DT2

vshl.u64 DT5, DT5, DT0
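
C The realignment in C terms (illustrative): if the input starts
C m bytes past an 8-byte boundary, then with rshift = 8*m and
C lshift = 64 - 8*m, each little-endian doubleword is reassembled
C from the aligned loads v[] as
C
C   w[i] = (v[i] << lshift) ^ (v[i-1] >> rshift);
C
C before the byte swap (vrev64.8) to big-endian order.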

C Set w[i] <-- (w[i-1] >> RSHIFT) ^ (w[i] << LSHIFT)
vld1.8 {W(0),W(1),W(2),W(3)}, [INPUT :64]!
vshl.u64 QT67, QW0001, QT01 C Right shift
vshl.u64 QW0001, QW0001, QT23 C Left shift
veor W(0), W(0), DT5
veor W(1), W(1), DT6
vrev64.8 QW0001, QW0001
vshl.u64 QT45, QW0203, QT01 C Right shift
vshl.u64 QW0203, QW0203, QT23 C Left shift
veor W(2), W(2), DT7
veor W(3), W(3), DT4
vrev64.8 QW0203, QW0203

vld1.8 {W(4),W(5),W(6),W(7)}, [INPUT :64]!
vshl.u64 QT67, QW0405, QT01 C Right shift
vshl.u64 QW0405, QW0405, QT23 C Left shift
veor W(4), W(4), DT5
veor W(5), W(5), DT6
vrev64.8 QW0405, QW0405
vshl.u64 QT45, QW0607, QT01 C Right shift
vshl.u64 QW0607, QW0607, QT23 C Left shift
veor W(6), W(6), DT7
veor W(7), W(7), DT4
vrev64.8 QW0607, QW0607

vld1.8 {W(8),W(9),W(10),W(11)}, [INPUT :64]!
vshl.u64 QT67, QW0809, QT01 C Right shift
vshl.u64 QW0809, QW0809, QT23 C Left shift
veor W(8), W(8), DT5
veor W(9), W(9), DT6
vrev64.8 QW0809, QW0809
vshl.u64 QT45, QW1011, QT01 C Right shift
vshl.u64 QW1011, QW1011, QT23 C Left shift
veor W(10), W(10), DT7
veor W(11), W(11), DT4
vrev64.8 QW1011, QW1011

vld1.8 {W(12),W(13),W(14),W(15)}, [INPUT :64]!
vshl.u64 QT67, QW1213, QT01 C Right shift
vshl.u64 QW1213, QW1213, QT23 C Left shift
veor W(12), W(12), DT5
veor W(13), W(13), DT6
vrev64.8 QW1213, QW1213
vshl.u64 QT45, QW1415, QT01 C Right shift
vshl.u64 QW1415, QW1415, QT23 C Left shift
veor W(14), W(14), DT7
veor W(15), W(15), DT4
vrev64.8 QW1415, QW1415

vldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH}

ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)

ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)

mov COUNT, #4
.Loop:
C Each iteration does 16 rounds with the message expansion
C interleaved; advance K to the constants for this batch.
add K, K, #128
EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
subs COUNT, COUNT, #1
EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
bne .Loop
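
C The overall round structure in C terms (illustrative only):
C
C   for (i = 0; i < 16; i++)    /* schedule read from the message */
C     ROUND(i);
C   for (i = 16; i < 80; i++)   /* the loop above: 4 batches of 16 */
C     { EXPAND(i); ROUND(i); }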

C Add the input state back into the new state (feed-forward),
C and store.
vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
vadd.i64 QSAB, QSAB, QW0001
vadd.i64 QSCD, QSCD, QW0203
vst1.64 {SA,SB,SC,SD}, [STATE]!
vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
vadd.i64 QSEF, QSEF, QW0001
vadd.i64 QSGH, QSGH, QW0203
vst1.64 {SE,SF,SG,SH}, [STATE]!
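
C (In C terms, the feed-forward above is:
C    for (i = 0; i < 8; i++) state[i] += abcdefgh[i]; )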

vpop {d8,d9,d10,d11,d12,d13}

bx lr
EPILOGUE(_nettle_sha512_compress)