1
! -*- mode: asm; asm-comment-char: ?!; -*-
2
! nettle, low-level cryptographics library
4
! Copyright (C) 2002 Niels Möller
6
! The nettle library is free software; you can redistribute it and/or modify
7
! it under the terms of the GNU Lesser General Public License as published by
8
! the Free Software Foundation; either version 2.1 of the License, or (at your
9
! option) any later version.
11
! The nettle library is distributed in the hope that it will be useful, but
12
! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13
! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14
! License for more details.
16
! You should have received a copy of the GNU Lesser General Public License
17
! along with the nettle library; see the file COPYING.LIB. If not, write to
18
! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
! FIXME: For improved ultra sparc performance, we should avoid ALU
22
! instructions that use the result of an immediately preceding ALU
23
! instruction. It is also a good idea to have a greater distance than
24
! one instruction between a load and use of its value, as that reduces
25
! the penalty for cache misses. Such instruction sequences are marked
28
! NOTE: Some of the %g registers are reserved for operating system etc
29
! (see gcc/config/sparc.h). The only %g registers that seem safe to
32
! Used registers: %l0,1,2,3,4,5,6,7
33
! %i0,1,2,3,4 (%i6=%fp, %i7 = return)
34
! %o0,1,2,3,4 (%o6=%sp)
52
! Further loop invariants
67
C IDX1 contains the permutation values * 4 + 2
68
define(IDX1, <T + AES_SIDX1 >)
69
C IDX3 contains the permutation values * 4
70
define(IDX3, <T + AES_SIDX3 >)
74
C Get one word of input, XOR with first subkey, store in wtxt
95
C Compute one word in the round function.
96
C Input in wtxt, output stored in tmp + i.
98
C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
99
C the instruction is a part of.
100
define(<AES_ROUND>, <
102
ldub [wtxt+$1+3], t0 ! 0
103
ldub [wtxt+t1], t1 ! 1
108
ld [T1+t1], t1 ! 1 !U
111
xor t0, t1, t0 ! 0, 1
113
ldub [wtxt+eval($1 ^ 8)+1], t1 ! 2
114
ldub [wtxt+t2], t2 ! 3
117
ld [T2+t1], t1 ! 2 !U
119
ld [T3+t2], t2 ! 3 !U
120
xor t0, t1, t0 ! 0, 1, 2
124
xor t0, t2, t0 ! 0, 1, 2, 3
126
st t0, [tmp + $1]>)dnl
129
C Compute one word in the final round function.
130
C Input in wtxt, output converted to an octet string and stored at dst.
132
C The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
133
C the instruction is a part of.
134
define(<AES_FINAL_ROUND>, <
136
ldub [wtxt+$1+3], t0 ! 0
137
ldub [wtxt+t1], t1 ! 1
141
ld [IDX3 + $1], t2 ! 3
143
or t0, t1, t0 ! 0, 1 !U
146
ldub [wtxt+eval($1 ^ 8)+1], t1 ! 2
147
ldub [wtxt+t2], t2 ! 3
152
or t0, t1, t0 ! 0, 1, 2 !U
156
or t0, t2, t0 ! 0, 1, 2, 3
159
stb t1, [dst+$1+3] !U
162
stb t1, [dst+$1+2] !U
164
stb t1, [dst+$1+1] !U
166
stb t0, [dst+$1]>)dnl
168
C The stack frame looks like
170
C %fp - 4: OS-dependent link field
171
C %fp - 8: OS-dependent link field
172
C %fp - 24: tmp, uint32_t[4]
173
C %fp - 40: wtxt, uint32_t[4]
174
C %fp - 136: OS register save area.
175
C FRAME_SIZE is the byte count subtracted from %sp by the save
C instruction in the function prologue.
define(<FRAME_SIZE>, 136)
179
.global _nettle_aes_crypt
180
.type _nettle_aes_crypt,#function
184
save %sp, -FRAME_SIZE, %sp
191
ld [ctx + AES_NROUNDS], nrounds
192
! Compute xor, so that we can swap efficiently.
194
! The loop variable will be multiplied by 16.
195
! More loop invariants
196
add T, AES_TABLE0, T0
198
add T, AES_TABLE1, T1
199
add T, AES_TABLE2, T2
200
add T, AES_TABLE3, T3
204
C Read src, and add initial subkey
211
sub nrounds, 1, round
219
AES_ROUND(12) ! i = 3
221
! switch roles for tmp and wtxt
224
subcc round, 1, round
229
C Final round, and storage of the output
231
AES_FINAL_ROUND(0) ! i = 0
232
AES_FINAL_ROUND(4) ! i = 1
233
AES_FINAL_ROUND(8) ! i = 2
234
AES_FINAL_ROUND(12) ! i = 3
236
addcc length, -16, length
244
.size _nettle_aes_crypt,.Leord-_nettle_aes_crypt
246
! Benchmarks on my slow sparcstation:
248
! aes128 (ECB encrypt): 14.36s, 0.696MB/s
249
! aes128 (ECB decrypt): 17.19s, 0.582MB/s
250
! aes128 (CBC encrypt): 16.08s, 0.622MB/s
251
! aes128 (CBC decrypt): 18.79s, 0.532MB/s
253
! aes192 (ECB encrypt): 16.85s, 0.593MB/s
254
! aes192 (ECB decrypt): 19.64s, 0.509MB/s
255
! aes192 (CBC encrypt): 18.43s, 0.543MB/s
256
! aes192 (CBC decrypt): 20.76s, 0.482MB/s
258
! aes256 (ECB encrypt): 19.12s, 0.523MB/s
259
! aes256 (ECB decrypt): 22.57s, 0.443MB/s
260
! aes256 (CBC encrypt): 20.92s, 0.478MB/s
261
! aes256 (CBC decrypt): 23.22s, 0.431MB/s
263
! After unrolling key_addition32, and getting rid of
264
! some sll x, 2, x, encryption speed is 0.760 MB/s.
266
! Next, the C code was optimized to use larger tables and
267
! no rotates. New timings:
268
! aes128 (ECB encrypt): 13.10s, 0.763MB/s
269
! aes128 (ECB decrypt): 11.51s, 0.869MB/s
270
! aes128 (CBC encrypt): 15.15s, 0.660MB/s
271
! aes128 (CBC decrypt): 13.10s, 0.763MB/s
273
! aes192 (ECB encrypt): 15.68s, 0.638MB/s
274
! aes192 (ECB decrypt): 13.59s, 0.736MB/s
275
! aes192 (CBC encrypt): 17.65s, 0.567MB/s
276
! aes192 (CBC decrypt): 15.31s, 0.653MB/s
278
! aes256 (ECB encrypt): 17.95s, 0.557MB/s
279
! aes256 (ECB decrypt): 15.90s, 0.629MB/s
280
! aes256 (CBC encrypt): 20.16s, 0.496MB/s
281
! aes256 (CBC decrypt): 17.47s, 0.572MB/s
283
! After optimization using pre-shifted indices
285
! aes128 (ECB encrypt): 12.46s, 0.803MB/s
286
! aes128 (ECB decrypt): 10.74s, 0.931MB/s
287
! aes128 (CBC encrypt): 17.74s, 0.564MB/s
288
! aes128 (CBC decrypt): 12.43s, 0.805MB/s
290
! aes192 (ECB encrypt): 14.59s, 0.685MB/s
291
! aes192 (ECB decrypt): 12.76s, 0.784MB/s
292
! aes192 (CBC encrypt): 19.97s, 0.501MB/s
293
! aes192 (CBC decrypt): 14.46s, 0.692MB/s
295
! aes256 (ECB encrypt): 17.00s, 0.588MB/s
296
! aes256 (ECB decrypt): 14.81s, 0.675MB/s
297
! aes256 (CBC encrypt): 22.65s, 0.442MB/s
298
! aes256 (CBC decrypt): 16.46s, 0.608MB/s
300
! After implementing double buffering
301
! aes128 (ECB encrypt): 12.59s, 0.794MB/s
302
! aes128 (ECB decrypt): 10.56s, 0.947MB/s
303
! aes128 (CBC encrypt): 17.91s, 0.558MB/s
304
! aes128 (CBC decrypt): 12.30s, 0.813MB/s
306
! aes192 (ECB encrypt): 15.03s, 0.665MB/s
307
! aes192 (ECB decrypt): 12.56s, 0.796MB/s
308
! aes192 (CBC encrypt): 20.30s, 0.493MB/s
309
! aes192 (CBC decrypt): 14.26s, 0.701MB/s
311
! aes256 (ECB encrypt): 17.30s, 0.578MB/s
312
! aes256 (ECB decrypt): 14.51s, 0.689MB/s
313
! aes256 (CBC encrypt): 22.75s, 0.440MB/s
314
! aes256 (CBC decrypt): 16.35s, 0.612MB/s
316
! After reordering aes-encrypt.c and aes-decrypt.c
317
! (the order probably causes strange cache-effects):
318
! aes128 (ECB encrypt): 9.21s, 1.086MB/s
319
! aes128 (ECB decrypt): 11.13s, 0.898MB/s
320
! aes128 (CBC encrypt): 14.12s, 0.708MB/s
321
! aes128 (CBC decrypt): 13.77s, 0.726MB/s
323
! aes192 (ECB encrypt): 10.86s, 0.921MB/s
324
! aes192 (ECB decrypt): 13.17s, 0.759MB/s
325
! aes192 (CBC encrypt): 15.74s, 0.635MB/s
326
! aes192 (CBC decrypt): 15.91s, 0.629MB/s
328
! aes256 (ECB encrypt): 12.71s, 0.787MB/s
329
! aes256 (ECB decrypt): 15.38s, 0.650MB/s
330
! aes256 (CBC encrypt): 17.49s, 0.572MB/s
331
! aes256 (CBC decrypt): 17.87s, 0.560MB/s
333
! After further optimizations of the initial and final loops,
334
! source_loop and final_loop.
335
! aes128 (ECB encrypt): 8.07s, 1.239MB/s
336
! aes128 (ECB decrypt): 9.48s, 1.055MB/s
337
! aes128 (CBC encrypt): 12.76s, 0.784MB/s
338
! aes128 (CBC decrypt): 12.15s, 0.823MB/s
340
! aes192 (ECB encrypt): 9.43s, 1.060MB/s
341
! aes192 (ECB decrypt): 11.20s, 0.893MB/s
342
! aes192 (CBC encrypt): 14.19s, 0.705MB/s
343
! aes192 (CBC decrypt): 13.97s, 0.716MB/s
345
! aes256 (ECB encrypt): 10.81s, 0.925MB/s
346
! aes256 (ECB decrypt): 12.92s, 0.774MB/s
347
! aes256 (CBC encrypt): 15.59s, 0.641MB/s
348
! aes256 (CBC decrypt): 15.76s, 0.635MB/s
350
! After unrolling loops, and other optimizations suggested by
352
! aes128 (ECB encrypt): 6.40s, 1.562MB/s
353
! aes128 (ECB decrypt): 8.17s, 1.224MB/s
354
! aes128 (CBC encrypt): 13.11s, 0.763MB/s
355
! aes128 (CBC decrypt): 10.05s, 0.995MB/s
357
! aes192 (ECB encrypt): 7.43s, 1.346MB/s
358
! aes192 (ECB decrypt): 9.51s, 1.052MB/s
359
! aes192 (CBC encrypt): 14.09s, 0.710MB/s
360
! aes192 (CBC decrypt): 11.58s, 0.864MB/s
362
! aes256 (ECB encrypt): 8.57s, 1.167MB/s
363
! aes256 (ECB decrypt): 11.13s, 0.898MB/s
364
! aes256 (CBC encrypt): 15.30s, 0.654MB/s
365
! aes256 (CBC decrypt): 12.93s, 0.773MB/s