3
# ====================================================================
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
# project. The module is, however, dual licensed under OpenSSL and
6
# CRYPTOGAMS licenses depending on where you obtain it. For further
7
# details see http://www.openssl.org/~appro/cryptogams/.
8
# ====================================================================
10
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
11
# "hand-coded assembler"] doesn't stand for the whole improvement
12
# coefficient. It turned out that eliminating RC4_CHAR from config
13
# line results in ~40% improvement (yes, even for C implementation).
14
# Presumably it has everything to do with AMD cache architecture and
15
# RAW or whatever penalties. Once again! The module *requires* config
16
# line *without* RC4_CHAR! As for coding "secret," I bet on partial
17
# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
18
# I simply 'inc %r8b'. Even though the optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
20
# At least for AMD... How IA32E would perform remains to be seen...
22
# As was shown by Marc Bevand, reordering of a couple of load operations
23
# results in even higher performance gain of 3.3x:-) At least on
24
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
25
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
26
# Latter means that if you want to *estimate* what to expect from
27
# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
29
# Intel P4 EM64T core was found to run the AMD64 code really slow...
30
# The only way to achieve comparable performance on P4 was to keep
31
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
32
# compose blended code, which would perform even within 30% marginal
33
# on either AMD or Intel platforms, I implement both cases. See
34
# rc4_skey.c for further details...
36
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
37
# those with add/sub results in 50% performance improvement of folded
40
# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
41
# performance by >30% [unlike P4 32-bit case that is]. But this is
42
# provided that loads are reordered even more aggressively! Both code
43
# paths, AMD64 and EM64T, reorder loads in essentially same manner
44
# as my IA-64 implementation. On Opteron this resulted in modest 5%
45
# improvement [I had to test it], while final Intel P4 performance
46
# achieves respectable 432MBps on 2.8GHz processor now. For reference.
47
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
48
# RC4_INT code-path. While if executed on Opteron, it's only 25%
49
# slower than the RC4_INT one [meaning that if CPU µ-arch detection
50
# is not implemented, then this final RC4_CHAR code-path should be
51
# preferred, as it provides better *all-round* performance].
53
# Intel Core2 was observed to perform poorly on both code paths:-( It
54
# apparently suffers from some kind of partial register stall, which
55
# occurs in 64-bit mode only [as virtually identical 32-bit loop was
56
# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
57
# cloop1 boosts its performance by 80%! This loop appears to be optimal
58
# fit for Core2 and therefore the code was modified to skip cloop8 on
63
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
65
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
66
die "can't locate x86_64-xlate.pl";
68
open STDOUT,"| $^X $xlate $output";
84
.type RC4,\@function,4
94
movl -8($dat),$XX[0]#d
99
movl ($dat,$XX[0],4),$TX[0]#d
106
for ($i=0;$i<8;$i++) {
110
movl ($dat,$YY,4),$TY#d
111
ror \$8,%rax # ror is redundant when $i=0
113
movl ($dat,$XX[1],4),$TX[1]#d
115
movl $TX[0]#d,($dat,$YY,4)
117
movl $TY#d,($dat,$XX[0],4)
119
movb ($dat,$TY,4),%al
121
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
140
movl $XX[0]#d,-8($dat)
149
movl ($dat,$YY,4),$TY#d
150
movl $TX[0]#d,($dat,$YY,4)
151
movl $TY#d,($dat,$XX[0],4)
154
movl ($dat,$TX[0],4),$TY#d
155
movl ($dat,$XX[0],4),$TX[0]#d
167
movzb ($dat,$XX[0]),$TX[0]#d
179
# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
180
for ($i=0;$i<4;$i++) {
184
movzb ($dat,$YY),$TY#d
185
movzb $XX[1]#b,$XX[1]#d
186
movzb ($dat,$XX[1]),$TX[1]#d
187
movb $TX[0]#b,($dat,$YY)
189
movb $TY#b,($dat,$XX[0])
190
jne .Lcmov$i # Intel cmov is sloooow...
197
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
199
for ($i=4;$i<8;$i++) {
203
movzb ($dat,$YY),$TY#d
204
movzb $XX[1]#b,$XX[1]#d
205
movzb ($dat,$XX[1]),$TX[1]#d
206
movb $TX[0]#b,($dat,$YY)
208
movb $TY#b,($dat,$XX[0])
209
jne .Lcmov$i # Intel cmov is sloooow...
216
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
236
movzb ($dat,$YY),$TY#d
237
movb $TX[0]#b,($dat,$YY)
238
movb $TY#b,($dat,$XX[0])
242
movzb $XX[0]#b,$XX[0]#d
243
movzb ($dat,$TY),$TY#d
244
movzb ($dat,$XX[0]),$TX[0]#d
259
.extern OPENSSL_ia32cap_P
261
.type RC4_set_key,\@function,3
274
mov OPENSSL_ia32cap_P\@GOTPCREL(%rip),$idx#d
276
mov OPENSSL_ia32cap_P(%rip),$idx#d
287
mov %eax,($dat,%rax,4)
295
mov ($dat,$ido,4),%r10d
296
add ($inp,$len,1),$idx#b
299
mov ($dat,$idx,4),%r11d
301
mov %r10d,($dat,$idx,4)
302
mov %r11d,($dat,$ido,4)
317
mov ($dat,$ido),%r10b
318
add ($inp,$len),$idx#b
321
mov ($dat,$idx),%r11b
325
mov %r10b,($dat,$idx)
326
mov %r11b,($dat,$ido)
337
.size RC4_set_key,.-RC4_set_key
340
.type RC4_options,\@function,0
344
lea .Lopts-.(%rax),%rax
346
mov OPENSSL_ia32cap_P\@GOTPCREL(%rip),%edx
348
mov OPENSSL_ia32cap_P(%rip),%edx
361
.asciz "rc4(8x,char)"
362
.asciz "rc4(1x,char)"
363
.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
365
.size RC4_options,.-RC4_options
368
$code =~ s/#([bwd])/$1/gm;
370
$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPSCANLIB} ne "");