~ubuntu-branches/ubuntu/trusty/openssl/trusty

1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
1
#!/usr/bin/env perl
2
#
3
# ====================================================================
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
5
# project. The module is, however, dual licensed under OpenSSL and
6
# CRYPTOGAMS licenses depending on where you obtain it. For further
7
# details see http://www.openssl.org/~appro/cryptogams/.
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
8
# ====================================================================
9
#
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
10
# Version 2.1.
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
11
#
12
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
13
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
14
# [you'll notice a lot of resemblance], such as compressed S-boxes
15
# in little-endian byte order, prefetch of these tables in CBC mode,
16
# as well as avoiding L1 cache aliasing between stack frame and key
17
# schedule and already mentioned tables, compressed Td4...
18
#
19
# Performance in number of cycles per processed byte for 128-bit key:
20
#
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
21
#		ECB encrypt	ECB decrypt	CBC large chunk
22
# AMD64		33		41		13.0
23
# EM64T		38		59		18.6(*)
24
# Core 2	30		43		14.5(*)
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
25
#
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
26
# (*) with hyper-threading off
27
28
# Command line: [flavour] [output].  Both are optional positional arguments.
$flavour = shift;
$output  = shift;
# A first argument containing a dot is really the output file name.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by an
# output file ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path, including the trailing separator.
# Use an explicit empty string when the script was started without a path
# instead of relying on whatever a failed match left in $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted so a directory containing spaces still works,
# and a failure to start the pipe is fatal rather than silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
41
42
# Code-generation knobs.
$verticalspin = 1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit  = 512;	# see aes-586.pl for details

# Accumulator for the generated assembly text.
$code = ".text\n";

# The four 32-bit state words.
($s0, $s1, $s2, $s3) = ("%eax", "%ebx", "%ecx", "%edx");

# Scratch registers; the 64-bit names of the same registers double as
# mask holders.
($acc0, $acc1, $acc2)       = ("%esi", "%edi", "%ebp");
($mask80, $maskfe, $mask1b) = ("%rsi", "%rdi", "%rbp");

# Pointers, temporaries, round counter, table base and key schedule.
($inp, $out)     = ("%r8", "%r9");
($t0, $t1, $t2)  = ("%r10d", "%r11d", "%r12d");
$rnds            = "%r13d";
($sbox, $key)    = ("%r14", "%r15");
63
64
# hi(REG): return the second-byte register (%ah, %bh, %ch, %dh) for a
# %e?x / %r?x register name.  Names that do not match are returned unchanged.
# Declared without a prototype because it takes one argument; callers use
# the &hi(...) form.
sub hi { my $r=shift;	$r =~ s/%[er]([a-d])x/%${1}h/;	$r; }
65
# lo(REG): return the low-byte register for REG.
#   %e?x / %r?x       -> %?l   (e.g. %eax -> %al)
#   %esi/%edi (or r)  -> %sil / %dil
#   %rN / %rNd        -> %rNb
# Names that match none of the patterns are returned unchanged.
# Declared without a prototype because it takes one argument; callers use
# the &lo(...) form.
sub lo { my $r=shift;
	$r =~ s/%[er]([a-d])x/%${1}l/;
	$r =~ s/%[er]([sd]i)/%${1}l/;
	$r =~ s/%(r[0-9]+)[d]?/%${1}b/;
	$r;
}
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
68
# LO(REG): return the 32-bit name of a 64-bit register name.
#   %rax-style -> %eax-style,  %rN -> %rNd
# The argument is expected to be a 64-bit name; an already suffixed %rNd
# would get a second "d" appended.
# Declared without a prototype because it takes one argument; callers use
# the &LO(...) form.
sub LO { my $r=shift;
	$r =~ s/%r([a-z]+)/%e${1}/;
	$r =~ s/%r([0-9]+)/%r${1}d/;
	$r;
}
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
70
# _data_word(LIST): for every value in LIST append one ".long" line to $code
# holding that value twice, formatted as 8-digit hex.  Processing stops at
# the first undefined element.
sub _data_word()
{
    foreach my $word (@_) {
	last unless defined($word);
	$code .= sprintf(".long\t0x%08x,0x%08x\n", $word, $word);
    }
}
74
# data_word(LIST): append a single ".long" line to $code listing every value
# of LIST as comma-separated 8-digit hex.
# An empty LIST emits nothing (it used to emit a spurious zero word), and
# every element is emitted (an undefined element no longer truncates the
# row; it is formatted as zero).
# Declared without a prototype because it takes a list; callers use the
# &data_word(...) form.
sub data_word
{
    return unless @_;
    $code .= ".long\t" . join(",", map { sprintf("0x%08x", $_) } @_) . "\n";
}
81
82
# data_byte(LIST): append a single ".byte" line to $code listing the low
# eight bits of every value of LIST as comma-separated 2-digit hex.
# An empty LIST emits nothing (it used to emit a spurious zero byte), and
# every element is emitted (an undefined element no longer truncates the
# row; it is formatted as zero).
# Declared without a prototype because it takes a list; callers use the
# &data_byte(...) form.
sub data_byte
{
    return unless @_;
    $code .= ".byte\t" . join(",", map { sprintf("0x%02x", $_ & 0xff) } @_) . "\n";
}
89
90
# encvert(): append the "vertical" form of one full encryption round to $code.
# Each byte of the state words $s0..$s3 is extracted (movzb / shr) and used
# to index the table at $sbox with an 8-byte stride at byte offsets 0, 3, 2
# and 1; the looked-up words are combined into $t0..$t3.  $key is advanced by
# 16, the four key words at the new $key are loaded into $s0..$s3 and the
# accumulators are xor-ed in.
sub encvert()
{
    my $t3 = "%r8d";	# fourth accumulator; zaps $inp!

    $code .= <<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	shr	\$16,$s2
	movzb	`&hi("$s0")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
149
150
# enclastvert(): append the "vertical" form of the final encryption round to
# $code.  Every state byte is looked up in the table at $sbox (8-byte
# stride); the wanted byte lane of each result is isolated either with a
# byte load (movzb at offset 2) or with an "and" mask, and the lanes are
# merged into $t0..$t3.  The key words at 16($key) are then loaded into
# $s0..$s3 and xor-ed with the accumulators.  $key itself is not advanced.
sub enclastvert()
{
    my $t3 = "%r8d";	# fourth accumulator; zaps $inp!

    $code .= <<___;
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t0
	movzb	2($sbox,$acc1,8),$t1
	movzb	2($sbox,$acc2,8),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	shr	\$16,$s3
	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3

	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
239
240
# encstep(I, S0, S1, S2, S3): append to $code the instructions producing
# output word I (0..3) of a regular encryption round from the four state
# registers given in S0..S3 (already rotated by the caller).
# Steps 0..2 copy their inputs into the shared scratch registers and write
# the result to $t0..$t2.  Step 3 works in place on the state registers,
# leaves its result in S0 and finally moves $t0..$t2 back into S1..S3.
# Step 0 also advances $key by 16; every step xors in key word I.
sub encstep()
{
    my ($i, @s) = @_;
    my $final = ($i == 3);
    my ($tmp0, $tmp1, $tmp2) = $final ? @s[1 .. 3] : ($acc0, $acc1, $acc2);
    my $out = ($t0, $t1, $t2, $s[0])[$i];
    my @insn;

    push(@insn, "movzb\t" . &lo($s[0]) . ",$out");
    push(@insn, "mov\t$s[2],$tmp1")		unless $final;
    push(@insn, "lea\t16($key),$key")		if $i == 0;

    push(@insn, "movzb\t" . &hi($s[1]) . ",$tmp0");
    push(@insn, "mov\t0($sbox,$out,8),$out");

    push(@insn, "shr\t\$16,$tmp1");
    push(@insn, "mov\t$s[3],$tmp2")		unless $final;
    push(@insn, "xor\t3($sbox,$tmp0,8),$out");

    push(@insn, "movzb\t" . &lo($tmp1) . ",$tmp1");
    push(@insn, "shr\t\$24,$tmp2");
    push(@insn, "xor\t4*$i($key),$out");

    push(@insn, "xor\t2($sbox,$tmp1,8),$out");
    push(@insn, "xor\t1($sbox,$tmp2,8),$out");

    # Only the last step commits the earlier results to the state registers.
    push(@insn, "mov\t$t0,$s[1]", "mov\t$t1,$s[2]", "mov\t$t2,$s[3]")
	if $final;

    $code .= "\t$_\n" foreach @insn;
    $code .= "\n";
}
275
276
# enclast(I, S0, S1, S2, S3): append to $code the instructions producing
# output word I (0..3) of the final encryption round from the state
# registers S0..S3 (already rotated by the caller).  Each table result is
# masked down to a single byte lane before the lanes are xor-ed together.
# Steps 0..2 use the shared scratch registers and $t0..$t2.  Step 3 works in
# place, leaves its result in S0 and moves $t0..$t2 back into S1..S3.
# No round key is applied here.
sub enclast()
{
    my ($i, @s) = @_;
    my $final = ($i == 3);
    my ($tmp0, $tmp1, $tmp2) = $final ? @s[1 .. 3] : ($acc0, $acc1, $acc2);
    my $out = ($t0, $t1, $t2, $s[0])[$i];
    my @insn;

    push(@insn, "movzb\t" . &lo($s[0]) . ",$out");
    push(@insn, "mov\t$s[2],$tmp1")		unless $final;

    push(@insn, "mov\t2($sbox,$out,8),$out");
    push(@insn, "shr\t\$16,$tmp1");
    push(@insn, "mov\t$s[3],$tmp2")		unless $final;

    push(@insn, "and\t\$0x000000ff,$out");
    push(@insn, "movzb\t" . &hi($s[1]) . ",$tmp0");
    push(@insn, "movzb\t" . &lo($tmp1) . ",$tmp1");
    push(@insn, "shr\t\$24,$tmp2");

    push(@insn, "mov\t0($sbox,$tmp0,8),$tmp0");
    push(@insn, "mov\t0($sbox,$tmp1,8),$tmp1");
    push(@insn, "mov\t2($sbox,$tmp2,8),$tmp2");

    push(@insn, "and\t\$0x0000ff00,$tmp0");
    push(@insn, "and\t\$0x00ff0000,$tmp1");
    push(@insn, "and\t\$0xff000000,$tmp2");

    # In the last step the write-backs are interleaved with the xors.
    push(@insn, "xor\t$tmp0,$out");
    push(@insn, "mov\t$t0,$s[1]")		if $final;
    push(@insn, "xor\t$tmp1,$out");
    push(@insn, "mov\t$t1,$s[2]")		if $final;
    push(@insn, "xor\t$tmp2,$out");
    push(@insn, "mov\t$t2,$s[3]")		if $final;

    $code .= "\t$_\n" foreach @insn;
    $code .= "\n";
}
316
317
# Emit _x86_64_AES_encrypt: initial key xor, a loop of regular rounds that
# runs key->rounds - 1 times, and the final round.  The round bodies come
# from the "vertical" generators when $verticalspin is set, otherwise from
# four per-word steps with the state registers rotated by one each time.
$code.=<<___;
.type	_x86_64_AES_encrypt,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Lenc_loop
.align	16
.Lenc_loop:
___
if ($verticalspin) {
	&encvert();
} else {
	my @s = ($s0,$s1,$s2,$s3);
	foreach my $i (0 .. 3) {
		&encstep($i,@s);
		push(@s,shift(@s));	# (s0,s1,s2,s3) -> (s1,s2,s3,s0) -> ...
	}
}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Lenc_loop
___
if ($verticalspin) {
	&enclastvert();
} else {
	my @s = ($s0,$s1,$s2,$s3);
	foreach my $i (0 .. 3) {
		&enclast($i,@s);
		push(@s,shift(@s));
	}
	# enclast does not apply the last round key, so do it here.
	$code.=<<___;
		xor	16+0($key),$s0		# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___
358
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
359
# it's possible to implement this by shifting tN by 8, filling least
360
# significant byte with byte load and finally bswap-ing at the end,
361
# but such partial register load kills Core 2...
362
# enccompactvert(): append to $code one byte-substitution pass of the
# compact encryption path.  Every byte of $s0..$s3 is replaced through a
# byte-indexed lookup at $sbox (stride 1), shifted into its destination lane
# (shl 8/16/24) and merged, leaving the four result words in $s0..$s3.
# Uses %r8d, %r9d and %r13d as extra temporaries in addition to $t0..$t2 and
# $acc0..$acc2.
sub enccompactvert()
{
    my $t3 = "%r8d";
    my $t4 = "%r9d";
    my $t5 = "%r13d";

    $code .= <<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s3")`,$acc2
	movzb	`&hi("$s0")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s3")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s0")`,$acc1
	shr	\$8,$s2
	shr	\$8,$s1
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$s2,1),$s3	#$t3
	movzb	($sbox,$s1,1),$s2	#$t2
	shl	\$16,$t4
	shl	\$16,$t5
	shl	\$16,$acc2
	xor	$t4,$t1
	xor	$t5,$t2
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$acc1
	shl	\$24,$s3
	xor	$acc0,$t0
	shl	\$24,$s2
	xor	$acc1,$t1
	mov	$t0,$s0
	mov	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
437
438
# enctransform_ref(REG): append to $code a single-word transform of the
# 32-bit register REG, done in place.
#   1. Build the byte-wise doubled value: the 0x80 bits are turned into a
#      per-byte 0x1b correction that is xor-ed into (REG+REG) & 0xfefefefe.
#   2. Combine: REG ^= doubled; rotate left 24; REG ^= doubled; then xor in
#      the original value rotated right by 16 and by 24.
# Clobbers %r8d, %r9d and %r13d.
sub enctransform_ref()
{
    my $sn = shift;
    my $hibits  = "%r8d";	# high-bit mask, later the 0x1b correction
    my $doubled = "%r9d";	# byte-wise doubled value
    my $scratch = "%r13d";	# shift scratch, later the rotated original

    $code .= <<___;
	mov	$sn,$hibits
	and	\$0x80808080,$hibits
	mov	$hibits,$scratch
	shr	\$7,$scratch
	lea	($sn,$sn),$doubled
	sub	$scratch,$hibits
	and	\$0xfefefefe,$doubled
	and	\$0x1b1b1b1b,$hibits
	mov	$sn,$scratch
	xor	$hibits,$doubled

	xor	$doubled,$sn
	rol	\$24,$sn
	xor	$doubled,$sn
	ror	\$16,$scratch
	xor	$scratch,$sn
	ror	\$8,$scratch
	xor	$scratch,$sn
___
}
463
464
# unlike decrypt case it does not pay off to parallelize enctransform
465
# enctransform(): append to $code the transform of all four state words
# $s0..$s3 in place, handling them two at a time ($s0/$s1, then $s2/$s3) with
# the instructions of the two pairs interleaved.  For each word a byte-wise
# doubled value is built (0x80808080 / 0xfefefefe / 0x1b1b1b1b masks) and
# combined with rotations of the word by 24, 16 and 8 bits.  Four loads from
# $sbox (offsets 0, 64, 128, 192) are mixed into the tail as a prefetch.
# $acc2 serves as the fourth temporary; %r8d and %r9d hold the doubled values.
sub enctransform()
{
    my $t3  = $acc2;
    my $r20 = "%r8d";
    my $r21 = "%r9d";

    $code .= <<___;
	mov	$s0,$acc0
	mov	$s1,$acc1
	and	\$0x80808080,$acc0
	and	\$0x80808080,$acc1
	mov	$acc0,$t0
	mov	$acc1,$t1
	shr	\$7,$t0
	lea	($s0,$s0),$r20
	shr	\$7,$t1
	lea	($s1,$s1),$r21
	sub	$t0,$acc0
	sub	$t1,$acc1
	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s0,$t0
	mov	$s1,$t1
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$s2,$acc0
	 mov	$s3,$acc1
	rol	\$24,$s0
	rol	\$24,$s1
	 and	\$0x80808080,$acc0
	 and	\$0x80808080,$acc1
	xor	$r20,$s0
	xor	$r21,$s1
	 mov	$acc0,$t2
	 mov	$acc1,$t3
	ror	\$16,$t0
	ror	\$16,$t1
	 shr	\$7,$t2
	 lea	($s2,$s2),$r20
	xor	$t0,$s0
	xor	$t1,$s1
	 shr	\$7,$t3
	 lea	($s3,$s3),$r21
	ror	\$8,$t0
	ror	\$8,$t1
	 sub	$t2,$acc0
	 sub	$t3,$acc1
	xor	$t0,$s0
	xor	$t1,$s1

	and	\$0xfefefefe,$r20
	and	\$0xfefefefe,$r21
	and	\$0x1b1b1b1b,$acc0
	and	\$0x1b1b1b1b,$acc1
	mov	$s2,$t2
	mov	$s3,$t3
	xor	$acc0,$r20
	xor	$acc1,$r21

	xor	$r20,$s2
	xor	$r21,$s3
	rol	\$24,$s2
	rol	\$24,$s3
	xor	$r20,$s2
	xor	$r21,$s3
	mov	0($sbox),$acc0			# prefetch Te4
	ror	\$16,$t2
	ror	\$16,$t3
	mov	64($sbox),$acc1
	xor	$t2,$s2
	xor	$t3,$s3
	mov	128($sbox),$r20
	ror	\$8,$t2
	ror	\$8,$t3
	mov	192($sbox),$r21
	xor	$t2,$s2
	xor	$t3,$s3
___
}
546
547
# Emit _x86_64_AES_encrypt_compact.
# Entry: touch the 256-byte table at $sbox in 32-byte steps (prefetch).
$code.=<<___;
.type	_x86_64_AES_encrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_encrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Te4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Lenc_loop_compact
.align	16
.Lenc_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___

# Loop body: byte substitution, then leave the loop once $key equals the
# pointer stored at 16(%rsp); otherwise apply the word transform.
&enccompactvert();
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Lenc_compact_done
___
&enctransform();

# Exit: final key xor and return.
$code.=<<___;
	jmp	.Lenc_loop_compact
.align	16
.Lenc_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
___
586
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
587
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
588
# Emit the public entry point AES_encrypt (also exported under the hidden
# alias asm_AES_encrypt).

# Prologue: save the six pushed registers, carve out a 64-byte aligned
# frame positioned relative to the key schedule address, and remember the
# output pointer and the original stack pointer in that frame.
$code.=<<___;
.globl	AES_encrypt
.type	AES_encrypt,\@function,3
.align	16
.globl	asm_AES_encrypt
.hidden	asm_AES_encrypt
asm_AES_encrypt:
AES_encrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%r10
	lea	-63(%rdx),%rcx	# %rdx is key argument
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp
	sub	\$32,%rsp

	mov	%rsi,16(%rsp)	# save out
	mov	%r10,24(%rsp)	# save real stack pointer
.Lenc_prologue:

___

# Body: load the round count and the input block, store the start and end of
# the key schedule in the frame, select a table copy and run the compact
# encryption routine.
$code.=<<___;
	mov	%rdx,$key
	mov	240($key),$rnds	# load rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	mov	$key,(%rsp)	# key schedule
	mov	%rbp,8(%rsp)	# end of key schedule

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	lea	.LAES_Te+2048(%rip),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox

	call	_x86_64_AES_encrypt_compact

___

# Epilogue: write the result block, reload the saved registers from the
# original stack area and return.
$code.=<<___;
	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsi	# restore saved stack pointer
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lenc_epilogue:
	ret
.size	AES_encrypt,.-AES_encrypt
___
657
658
#------------------------------------------------------------------#
659
660
# decvert(): append the "vertical" form of one full decryption round to $code.
# Each byte of the state words $s0..$s3 is extracted (movzb / shr) and used
# to index the table at $sbox with an 8-byte stride at byte offsets 0, 3, 2
# and 1; the looked-up words are combined into $t0..$t3.  $key is advanced by
# 16, the four key words at the new $key are loaded into $s0..$s3 and the
# accumulators are xor-ed in.
sub decvert()
{
    my $t3 = "%r8d";	# fourth accumulator; zaps $inp!

    $code .= <<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	`&lo("$s3")`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi("$s1")`,$acc0
	shr	\$16,$s0
	movzb	`&hi("$s2")`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	movzb	`&lo("$s1")`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi("$s3")`,$acc0
	mov	12($key),$s3
	movzb	`&hi("$s0")`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
}
719
720
# declastvert(): append the "vertical" form of the final decryption round to
# $code.  $sbox is temporarily advanced by 2048 so that the byte table
# located there can be indexed directly (stride 1); every state byte is
# substituted, shifted into its lane (shl 8/16/24) and merged into $t0..$t3.
# The key words at 16($key) are loaded into $s0..$s3 and xor-ed with the
# accumulators, and $sbox is restored before returning.
sub declastvert()
{
    my $t3 = "%r8d";	# fourth accumulator; zaps $inp!

    $code .= <<___;
	lea	2048($sbox),$sbox	# size optimization
	movzb	`&lo("$s0")`,$acc0
	movzb	`&lo("$s1")`,$acc1
	movzb	`&lo("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$t0
	movzb	($sbox,$acc1,1),$t1
	movzb	($sbox,$acc2,1),$t2

	movzb	`&lo("$s3")`,$acc0
	movzb	`&hi("$s3")`,$acc1
	movzb	`&hi("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$8,$acc1
	shl	\$8,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi("$s1")`,$acc0
	movzb	`&hi("$s2")`,$acc1
	shr	\$16,$s0
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3

	shl	\$8,$acc0
	shl	\$8,$acc1
	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo("$s2")`,$acc0
	movzb	`&lo("$s3")`,$acc1
	movzb	`&lo("$s0")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t0
	movzb	($sbox,$acc1,1),$acc1	#$t1
	movzb	($sbox,$acc2,1),$acc2	#$t2

	shl	\$16,$acc0
	shl	\$16,$acc1
	shl	\$16,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo("$s1")`,$acc0
	movzb	`&hi("$s1")`,$acc1
	movzb	`&hi("$s2")`,$acc2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	movzb	($sbox,$acc1,1),$acc1	#$t0
	movzb	($sbox,$acc2,1),$acc2	#$t1

	shl	\$16,$acc0
	shl	\$24,$acc1
	shl	\$24,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	mov	16+12($key),$s3
	movzb	($sbox,$acc0,1),$acc0	#$t2
	movzb	($sbox,$acc1,1),$acc1	#$t3
	mov	16+0($key),$s0

	shl	\$24,$acc0
	shl	\$24,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	lea	-2048($sbox),$sbox
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
}
811
812
# decstep(I, S0, S1, S2, S3): append to $code the instructions producing
# output word I (0..3) of a regular decryption round from the state
# registers S0..S3 (already permuted by the caller).
# Steps 0..2 copy their inputs into $t0..$t2 and the shared scratch
# registers.  Step 3 works in place on the state registers, leaves its
# result in S0 and finally moves $t2, $t1, $t0 into S1, S2, S3.
# No round key is applied here.
sub decstep()
{
    my ($i, @s) = @_;
    my $final = ($i == 3);
    my ($tmp0, $tmp1, $tmp2) = $final ? @s[1 .. 3] : ($acc0, $acc1, $acc2);
    my $out = ($t0, $t1, $t2, $s[0])[$i];
    my @insn;

    push(@insn, "mov\t$s[0],$out", "mov\t$s[2],$tmp1")	unless $final;
    push(@insn, "and\t\$0xFF,$out");

    push(@insn, "mov\t0($sbox,$out,8),$out");
    push(@insn, "shr\t\$16,$tmp1");
    push(@insn, "mov\t$s[3],$tmp2")			unless $final;

    push(@insn, "movzb\t" . &hi($s[1]) . ",$tmp0");
    push(@insn, "and\t\$0xFF,$tmp1");
    push(@insn, "shr\t\$24,$tmp2");

    push(@insn, "xor\t3($sbox,$tmp0,8),$out");
    push(@insn, "xor\t2($sbox,$tmp1,8),$out");
    push(@insn, "xor\t1($sbox,$tmp2,8),$out");

    # Only the last step commits the earlier results to the state registers.
    push(@insn, "mov\t$t2,$s[1]", "mov\t$t1,$s[2]", "mov\t$t0,$s[3]")
	if $final;

    $code .= "\t$_\n" foreach @insn;
    $code .= "\n";
}
843
844
# declast(I, S0, S1, S2, S3): append to $code the instructions producing
# output word I (0..3) of the final decryption round from the state
# registers S0..S3 (already permuted by the caller).  Each byte is
# substituted through the byte table at 2048($sbox), shifted into its lane
# and xor-ed into the result.
# Steps 0..2 use $t0..$t2 and the shared scratch registers.  Step 3 works in
# place, leaves its result in S0 and moves $t2, $t1, $t0 into S1, S2, S3.
# No round key is applied here.
sub declast()
{
    my ($i, @s) = @_;
    my $final = ($i == 3);
    my ($tmp0, $tmp1, $tmp2) = $final ? @s[1 .. 3] : ($acc0, $acc1, $acc2);
    my $out = ($t0, $t1, $t2, $s[0])[$i];
    my @insn;

    push(@insn, "mov\t$s[0],$out", "mov\t$s[2],$tmp1")	unless $final;
    push(@insn, "and\t\$0xFF,$out");

    push(@insn, "movzb\t2048($sbox,$out,1),$out");
    push(@insn, "shr\t\$16,$tmp1");
    push(@insn, "mov\t$s[3],$tmp2")			unless $final;

    push(@insn, "movzb\t" . &hi($s[1]) . ",$tmp0");
    push(@insn, "and\t\$0xFF,$tmp1");
    push(@insn, "shr\t\$24,$tmp2");

    push(@insn, "movzb\t2048($sbox,$tmp0,1),$tmp0");
    push(@insn, "movzb\t2048($sbox,$tmp1,1),$tmp1");
    push(@insn, "movzb\t2048($sbox,$tmp2,1),$tmp2");

    push(@insn, "shl\t\$8,$tmp0");
    push(@insn, "shl\t\$16,$tmp1");
    push(@insn, "shl\t\$24,$tmp2");

    # In the last step the write-backs are interleaved with the xors.
    push(@insn, "xor\t$tmp0,$out");
    push(@insn, "mov\t$t2,$s[1]")			if $final;
    push(@insn, "xor\t$tmp1,$out");
    push(@insn, "mov\t$t1,$s[2]")			if $final;
    push(@insn, "xor\t$tmp2,$out");
    push(@insn, "mov\t$t0,$s[3]")			if $final;

    $code .= "\t$_\n" foreach @insn;
    $code .= "\n";
}
882
883
# Emit _x86_64_AES_decrypt: initial key xor, a loop of regular rounds that
# runs key->rounds - 1 times, and the final round.  The round bodies come
# from the "vertical" generators when $verticalspin is set, otherwise from
# four per-word steps (state registers rotated the opposite way from the
# encrypt path) followed by an explicit round-key xor.
$code.=<<___;
.type	_x86_64_AES_decrypt,\@abi-omnipotent
.align	16
_x86_64_AES_decrypt:
	xor	0($key),$s0			# xor with key
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3

	mov	240($key),$rnds			# load key->rounds
	sub	\$1,$rnds
	jmp	.Ldec_loop
.align	16
.Ldec_loop:
___
if ($verticalspin) {
	&decvert();
} else {
	my @s = ($s0,$s3,$s2,$s1);
	foreach my $i (0 .. 3) {
		&decstep($i,@s);
		unshift(@s,pop(@s));	# (s0,s3,s2,s1) -> (s1,s0,s3,s2) -> ...
	}
	$code.=<<___;
		lea	16($key),$key
		xor	0($key),$s0			# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
___
}
$code.=<<___;
	sub	\$1,$rnds
	jnz	.Ldec_loop
___
if ($verticalspin) {
	&declastvert();
} else {
	my @s = ($s0,$s3,$s2,$s1);
	foreach my $i (0 .. 3) {
		&declast($i,@s);
		unshift(@s,pop(@s));
	}
	$code.=<<___;
		xor	16+0($key),$s0			# xor with key
		xor	16+4($key),$s1
		xor	16+8($key),$s2
		xor	16+12($key),$s3
___
}
$code.=<<___;
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___
931
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
932
# deccompactvert(): append to $code one byte-substitution pass of the
# compact decryption path.  Every byte of $s0..$s3 is replaced through a
# byte-indexed lookup at $sbox (stride 1), shifted into its destination lane
# (shl 8/16/24) and merged, leaving the four result words in $s0..$s3.
# Uses %r8d, %r9d and %r13d as extra temporaries in addition to $t0..$t2 and
# $acc0..$acc2.
sub deccompactvert()
{
    my $t3 = "%r8d";
    my $t4 = "%r9d";
    my $t5 = "%r13d";

    $code .= <<___;
	movzb	`&lo("$s0")`,$t0
	movzb	`&lo("$s1")`,$t1
	movzb	`&lo("$s2")`,$t2
	movzb	($sbox,$t0,1),$t0
	movzb	($sbox,$t1,1),$t1
	movzb	($sbox,$t2,1),$t2

	movzb	`&lo("$s3")`,$t3
	movzb	`&hi("$s3")`,$acc0
	movzb	`&hi("$s0")`,$acc1
	movzb	($sbox,$t3,1),$t3
	movzb	($sbox,$acc0,1),$t4	#$t0
	movzb	($sbox,$acc1,1),$t5	#$t1

	movzb	`&hi("$s1")`,$acc2
	movzb	`&hi("$s2")`,$acc0
	shr	\$16,$s2
	movzb	($sbox,$acc2,1),$acc2	#$t2
	movzb	($sbox,$acc0,1),$acc0	#$t3
	shr	\$16,$s3

	movzb	`&lo("$s2")`,$acc1
	shl	\$8,$t4
	shl	\$8,$t5
	movzb	($sbox,$acc1,1),$acc1	#$t0
	xor	$t4,$t0
	xor	$t5,$t1

	movzb	`&lo("$s3")`,$t4
	shr	\$16,$s0
	shr	\$16,$s1
	movzb	`&lo("$s0")`,$t5
	shl	\$8,$acc2
	shl	\$8,$acc0
	movzb	($sbox,$t4,1),$t4	#$t1
	movzb	($sbox,$t5,1),$t5	#$t2
	xor	$acc2,$t2
	xor	$acc0,$t3

	movzb	`&lo("$s1")`,$acc2
	movzb	`&hi("$s1")`,$acc0
	shl	\$16,$acc1
	movzb	($sbox,$acc2,1),$acc2	#$t3
	movzb	($sbox,$acc0,1),$acc0	#$t0
	xor	$acc1,$t0

	movzb	`&hi("$s2")`,$acc1
	shl	\$16,$t4
	shl	\$16,$t5
	movzb	($sbox,$acc1,1),$s1	#$t1
	xor	$t4,$t1
	xor	$t5,$t2

	movzb	`&hi("$s3")`,$acc1
	shr	\$8,$s0
	shl	\$16,$acc2
	movzb	($sbox,$acc1,1),$s2	#$t2
	movzb	($sbox,$s0,1),$s3	#$t3
	xor	$acc2,$t3

	shl	\$24,$acc0
	shl	\$24,$s1
	shl	\$24,$s2
	xor	$acc0,$t0
	shl	\$24,$s3
	xor	$t1,$s1
	mov	$t0,$s0
	xor	$t2,$s2
	xor	$t3,$s3
___
}
1007
1008
# parallelized version! input is pair of 64-bit values: %rax=s1.s0
1009
# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
1010
# %ecx=s2 and %edx=s3.
1011
# dectransform(PREFETCH): append to $code the parallelized word transform of
# the compact decryption path.
# Input is a pair of 64-bit values, %rax = s1.s0 and %rcx = s3.s2; output is
# four 32-bit values in %eax, %ebx, %ecx and %edx.  Both 64-bit lanes are
# processed with identical, interleaved instruction sequences: three
# successive byte-wise doublings (using the masks held in $mask80, $maskfe
# and $mask1b) give tp2, tp4 and tp8, which are then combined with 8-, 16-
# and 24-bit rotations as noted in the inline comments.
# When PREFETCH is true, five loads from $sbox (offsets 0..256 in steps of
# 64) are woven into the tail; they overwrite the mask registers and the tp8
# temporaries.
# Note that $acc0 is a local 64-bit register name here and hides the
# file-level $acc0.
sub dectransform()
{
    my ($tp10, $tp20, $tp40, $tp80, $acc0) =
	("%rax", "%r8", "%r9", "%r10", "%rbx");
    my ($tp18, $tp28, $tp48, $tp88, $acc8) =
	("%rcx", "%r11", "%r12", "%r13", "%rdx");
    my $prefetch = shift;

    $code .= <<___;
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48
	shr	\$7,$tp40
	lea	($tp10,$tp10),$tp20
	shr	\$7,$tp48
	lea	($tp18,$tp18),$tp28
	sub	$tp40,$acc0
	sub	$tp48,$acc8
	and	$maskfe,$tp20
	and	$maskfe,$tp28
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp20,$acc0
	xor	$tp28,$acc8
	mov	$acc0,$tp20
	mov	$acc8,$tp28

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	shr	\$7,$tp80
	lea	($tp20,$tp20),$tp40
	shr	\$7,$tp88
	lea	($tp28,$tp28),$tp48
	sub	$tp80,$acc0
	sub	$tp88,$acc8
	and	$maskfe,$tp40
	and	$maskfe,$tp48
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$tp40,$acc0
	xor	$tp48,$acc8
	mov	$acc0,$tp40
	mov	$acc8,$tp48

	and	$mask80,$acc0
	and	$mask80,$acc8
	mov	$acc0,$tp80
	mov	$acc8,$tp88
	shr	\$7,$tp80
	 xor	$tp10,$tp20		# tp2^=tp1
	shr	\$7,$tp88
	 xor	$tp18,$tp28		# tp2^=tp1
	sub	$tp80,$acc0
	sub	$tp88,$acc8
	lea	($tp40,$tp40),$tp80
	lea	($tp48,$tp48),$tp88
	 xor	$tp10,$tp40		# tp4^=tp1
	 xor	$tp18,$tp48		# tp4^=tp1
	and	$maskfe,$tp80
	and	$maskfe,$tp88
	and	$mask1b,$acc0
	and	$mask1b,$acc8
	xor	$acc0,$tp80
	xor	$acc8,$tp88

	xor	$tp80,$tp10		# tp1^=tp8
	xor	$tp88,$tp18		# tp1^=tp8
	xor	$tp80,$tp20		# tp2^tp1^=tp8
	xor	$tp88,$tp28		# tp2^tp1^=tp8
	mov	$tp10,$acc0
	mov	$tp18,$acc8
	xor	$tp80,$tp40		# tp4^tp1^=tp8
	xor	$tp88,$tp48		# tp4^tp1^=tp8
	shr	\$32,$acc0
	shr	\$32,$acc8
	xor	$tp20,$tp80		# tp8^=tp8^tp2^tp1=tp2^tp1
	xor	$tp28,$tp88		# tp8^=tp8^tp2^tp1=tp2^tp1
	rol	\$8,`&LO("$tp10")`	# ROTATE(tp1^tp8,8)
	rol	\$8,`&LO("$tp18")`	# ROTATE(tp1^tp8,8)
	xor	$tp40,$tp80		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
	xor	$tp48,$tp88		# tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2

	rol	\$8,`&LO("$acc0")`	# ROTATE(tp1^tp8,8)
	rol	\$8,`&LO("$acc8")`	# ROTATE(tp1^tp8,8)
	xor	`&LO("$tp80")`,`&LO("$tp10")`
	xor	`&LO("$tp88")`,`&LO("$tp18")`
	shr	\$32,$tp80
	shr	\$32,$tp88
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	mov	$tp20,$tp80
	mov	$tp28,$tp88
	shr	\$32,$tp80
	shr	\$32,$tp88
	rol	\$24,`&LO("$tp20")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp28")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp80")`	# ROTATE(tp2^tp1^tp8,24)
	rol	\$24,`&LO("$tp88")`	# ROTATE(tp2^tp1^tp8,24)
	xor	`&LO("$tp20")`,`&LO("$tp10")`
	xor	`&LO("$tp28")`,`&LO("$tp18")`
	mov	$tp40,$tp20
	mov	$tp48,$tp28
	xor	`&LO("$tp80")`,`&LO("$acc0")`
	xor	`&LO("$tp88")`,`&LO("$acc8")`

	`"mov	0($sbox),$mask80"	if ($prefetch)`
	shr	\$32,$tp20
	shr	\$32,$tp28
	`"mov	64($sbox),$maskfe"	if ($prefetch)`
	rol	\$16,`&LO("$tp40")`	# ROTATE(tp4^tp1^tp8,16)
	rol	\$16,`&LO("$tp48")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	128($sbox),$mask1b"	if ($prefetch)`
	rol	\$16,`&LO("$tp20")`	# ROTATE(tp4^tp1^tp8,16)
	rol	\$16,`&LO("$tp28")`	# ROTATE(tp4^tp1^tp8,16)
	`"mov	192($sbox),$tp80"	if ($prefetch)`
	xor	`&LO("$tp40")`,`&LO("$tp10")`
	xor	`&LO("$tp48")`,`&LO("$tp18")`
	`"mov	256($sbox),$tp88"	if ($prefetch)`
	xor	`&LO("$tp20")`,`&LO("$acc0")`
	xor	`&LO("$tp28")`,`&LO("$acc8")`
___
}
1136
1137
# Emit _x86_64_AES_decrypt_compact.
# Entry: touch the 256-byte table at $sbox in 32-byte steps (prefetch).
$code.=<<___;
.type	_x86_64_AES_decrypt_compact,\@abi-omnipotent
.align	16
_x86_64_AES_decrypt_compact:
	lea	128($sbox),$inp			# size optimization
	mov	0-128($inp),$acc1		# prefetch Td4
	mov	32-128($inp),$acc2
	mov	64-128($inp),$t0
	mov	96-128($inp),$t1
	mov	128-128($inp),$acc1
	mov	160-128($inp),$acc2
	mov	192-128($inp),$t0
	mov	224-128($inp),$t1
	jmp	.Ldec_loop_compact

.align	16
.Ldec_loop_compact:
		xor	0($key),$s0		# xor with key
		xor	4($key),$s1
		xor	8($key),$s2
		xor	12($key),$s3
		lea	16($key),$key
___

# Loop body: byte substitution, then leave the loop once $key equals the
# pointer stored at 16(%rsp).  Otherwise load the three masks stored after
# the table, pack the state into %rax = s1.s0 and %rcx = s3.s2, and apply
# the word transform with prefetch enabled.
&deccompactvert();
$code.=<<___;
		cmp	16(%rsp),$key
		je	.Ldec_compact_done

		mov	256+0($sbox),$mask80
		shl	\$32,%rbx
		shl	\$32,%rdx
		mov	256+8($sbox),$maskfe
		or	%rbx,%rax
		or	%rdx,%rcx
		mov	256+16($sbox),$mask1b
___
&dectransform(1);

# Exit: final key xor and return.
$code.=<<___;
	jmp	.Ldec_loop_compact
.align	16
.Ldec_compact_done:
	xor	0($key),$s0
	xor	4($key),$s1
	xor	8($key),$s2
	xor	12($key),$s3
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
___
# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block entry point; the same address is also exported as
# the hidden symbol asm_AES_decrypt.
#
# Frame built below (after the six callee-saved pushes):
#    0(%rsp)	key schedule pointer
#    8(%rsp)	end of key schedule (key + 16*rounds)
#   16(%rsp)	saved out pointer
#   24(%rsp)	%rsp as it was right after the pushes
# The 16-byte input is loaded from %rdi, a Td4 copy is selected relative
# to .LAES_Td+2048, _x86_64_AES_decrypt_compact is called, and the result
# is stored to out before the saved registers are reloaded.
$code.=<<___;
.globl	AES_decrypt
.type	AES_decrypt,\@function,3
.align	16
.globl	asm_AES_decrypt
.hidden	asm_AES_decrypt
asm_AES_decrypt:
AES_decrypt:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# allocate frame "above" key schedule
	mov	%rsp,%r10
	lea	-63(%rdx),%rcx	# %rdx is key argument
	and	\$-64,%rsp
	sub	%rsp,%rcx
	neg	%rcx
	and	\$0x3c0,%rcx
	sub	%rcx,%rsp
	sub	\$32,%rsp

	mov	%rsi,16(%rsp)	# save out
	mov	%r10,24(%rsp)	# save real stack pointer
.Ldec_prologue:

	mov	%rdx,$key
	mov	240($key),$rnds	# load rounds

	mov	0(%rdi),$s0	# load input vector
	mov	4(%rdi),$s1
	mov	8(%rdi),$s2
	mov	12(%rdi),$s3

	shl	\$4,$rnds
	lea	($key,$rnds),%rbp
	mov	$key,(%rsp)	# key schedule
	mov	%rbp,8(%rsp)	# end of key schedule

	# pick Td4 copy which can't "overlap" with stack frame or key schedule
	lea	.LAES_Td+2048(%rip),$sbox
	lea	768(%rsp),%rbp
	sub	$sbox,%rbp
	and	\$0x300,%rbp
	lea	($sbox,%rbp),$sbox
	shr	\$3,%rbp	# recall "magic" constants!
	add	%rbp,$sbox

	call	_x86_64_AES_decrypt_compact

	mov	16(%rsp),$out	# restore out
	mov	24(%rsp),%rsi	# restore saved stack pointer
	mov	$s0,0($out)	# write output vector
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Ldec_epilogue:
	ret
.size	AES_decrypt,.-AES_decrypt
___
#------------------------------------------------------------------#

# enckey: append one key-expansion step to $code.  Takes no arguments.
#
# Registers used by the emitted code:
#   %edx  word being substituted (shifted right by 16 along the way)
#   %eax  accumulator; receives the four substituted bytes and the rcon word
#   %rbp  table base; bytes are fetched from -128(%rbp,index) and the rcon
#         word from 1024-128(%rbp,%rcx,4)
#   %rcx  rcon index
#   %ebx, %esi  scratch
# The substituted bytes 0..3 of %edx are placed at bit offsets 24, 0, 8
# and 16 respectively before being xored into %eax.
sub enckey()
{
$code.=<<___;
	movz	%dl,%esi		# rk[i]>>0
	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi		# rk[i]>>8
	shl	\$24,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shr	\$16,%edx
	movz	%dl,%esi		# rk[i]>>16
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	movz	%dh,%esi		# rk[i]>>24
	shl	\$8,%ebx
	xor	%ebx,%eax

	movzb	-128(%rbp,%rsi),%ebx
	shl	\$16,%ebx
	xor	%ebx,%eax

	xor	1024-128(%rbp,%rcx,4),%eax		# rcon
___
}
# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Two routines are emitted here:
#   private_AES_set_encrypt_key  - public wrapper: pushes the callee-saved
#       registers, calls the worker and reloads them.
#   _x86_64_AES_set_encrypt_key  - worker (abi-omnipotent).  Return value
#       in %rax: 0 on success, -1 if userKey or key is NULL, -2 if bits is
#       not 128, 192 or 256.
# The worker copies the user key into the schedule and then expands it in
# one of three loops (.L10loop/.L12loop/.L14loop); the shared substitution
# step inside each loop is generated by enckey().
$code.=<<___;
.globl	private_AES_set_encrypt_key
.type	private_AES_set_encrypt_key,\@function,3
.align	16
private_AES_set_encrypt_key:
	push	%rbx
	push	%rbp
	push	%r12			# redundant, but allows to share
	push	%r13			# exception handler...
	push	%r14
	push	%r15
	sub	\$8,%rsp
.Lenc_key_prologue:

	call	_x86_64_AES_set_encrypt_key

	mov	8(%rsp),%r15
	mov	16(%rsp),%r14
	mov	24(%rsp),%r13
	mov	32(%rsp),%r12
	mov	40(%rsp),%rbp
	mov	48(%rsp),%rbx
	add	\$56,%rsp
.Lenc_key_epilogue:
	ret
.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key

.type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align	16
_x86_64_AES_set_encrypt_key:
	mov	%esi,%ecx			# %ecx=bits
	mov	%rdi,%rsi			# %rsi=userKey
	mov	%rdx,%rdi			# %rdi=key

	test	\$-1,%rsi
	jz	.Lbadpointer
	test	\$-1,%rdi
	jz	.Lbadpointer

	lea	.LAES_Te(%rip),%rbp
	lea	2048+128(%rbp),%rbp

	# prefetch Te4
	mov	0-128(%rbp),%eax
	mov	32-128(%rbp),%ebx
	mov	64-128(%rbp),%r8d
	mov	96-128(%rbp),%edx
	mov	128-128(%rbp),%eax
	mov	160-128(%rbp),%ebx
	mov	192-128(%rbp),%r8d
	mov	224-128(%rbp),%edx

	cmp	\$128,%ecx
	je	.L10rounds
	cmp	\$192,%ecx
	je	.L12rounds
	cmp	\$256,%ecx
	je	.L14rounds
	mov	\$-2,%rax			# invalid number of bits
	jmp	.Lexit

.L10rounds:
	mov	0(%rsi),%rax			# copy first 4 dwords
	mov	8(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rdx,8(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L10shortcut
.align	4
.L10loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	12(%rdi),%edx			# rk[3]
.L10shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,16(%rdi)			# rk[4]
		xor	4(%rdi),%eax
		mov	%eax,20(%rdi)			# rk[5]
		xor	8(%rdi),%eax
		mov	%eax,24(%rdi)			# rk[6]
		xor	12(%rdi),%eax
		mov	%eax,28(%rdi)			# rk[7]
		add	\$1,%ecx
		lea	16(%rdi),%rdi
		cmp	\$10,%ecx
	jl	.L10loop

	movl	\$10,80(%rdi)			# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.L12rounds:
	mov	0(%rsi),%rax			# copy first 6 dwords
	mov	8(%rsi),%rbx
	mov	16(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rbx,8(%rdi)
	mov	%rdx,16(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L12shortcut
.align	4
.L12loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	20(%rdi),%edx			# rk[5]
.L12shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,24(%rdi)			# rk[6]
		xor	4(%rdi),%eax
		mov	%eax,28(%rdi)			# rk[7]
		xor	8(%rdi),%eax
		mov	%eax,32(%rdi)			# rk[8]
		xor	12(%rdi),%eax
		mov	%eax,36(%rdi)			# rk[9]

		cmp	\$7,%ecx
		je	.L12break
		add	\$1,%ecx

		xor	16(%rdi),%eax
		mov	%eax,40(%rdi)			# rk[10]
		xor	20(%rdi),%eax
		mov	%eax,44(%rdi)			# rk[11]

		lea	24(%rdi),%rdi
	jmp	.L12loop
.L12break:
	movl	\$12,72(%rdi)		# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.L14rounds:
	mov	0(%rsi),%rax			# copy first 8 dwords
	mov	8(%rsi),%rbx
	mov	16(%rsi),%rcx
	mov	24(%rsi),%rdx
	mov	%rax,0(%rdi)
	mov	%rbx,8(%rdi)
	mov	%rcx,16(%rdi)
	mov	%rdx,24(%rdi)

	shr	\$32,%rdx
	xor	%ecx,%ecx
	jmp	.L14shortcut
.align	4
.L14loop:
		mov	0(%rdi),%eax			# rk[0]
		mov	28(%rdi),%edx			# rk[7]
.L14shortcut:
___
		&enckey	();
$code.=<<___;
		mov	%eax,32(%rdi)			# rk[8]
		xor	4(%rdi),%eax
		mov	%eax,36(%rdi)			# rk[9]
		xor	8(%rdi),%eax
		mov	%eax,40(%rdi)			# rk[10]
		xor	12(%rdi),%eax
		mov	%eax,44(%rdi)			# rk[11]

		cmp	\$6,%ecx
		je	.L14break
		add	\$1,%ecx

		mov	%eax,%edx
		mov	16(%rdi),%eax			# rk[4]
		movz	%dl,%esi			# rk[11]>>0
		movzb	-128(%rbp,%rsi),%ebx
		movz	%dh,%esi			# rk[11]>>8
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		shr	\$16,%edx
		shl	\$8,%ebx
		movz	%dl,%esi			# rk[11]>>16
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		movz	%dh,%esi			# rk[11]>>24
		shl	\$16,%ebx
		xor	%ebx,%eax

		movzb	-128(%rbp,%rsi),%ebx
		shl	\$24,%ebx
		xor	%ebx,%eax

		mov	%eax,48(%rdi)			# rk[12]
		xor	20(%rdi),%eax
		mov	%eax,52(%rdi)			# rk[13]
		xor	24(%rdi),%eax
		mov	%eax,56(%rdi)			# rk[14]
		xor	28(%rdi),%eax
		mov	%eax,60(%rdi)			# rk[15]

		lea	32(%rdi),%rdi
	jmp	.L14loop
.L14break:
	movl	\$14,48(%rdi)		# setup number of rounds
	xor	%rax,%rax
	jmp	.Lexit

.Lbadpointer:
	mov	\$-1,%rax
.Lexit:
	.byte	0xf3,0xc3			# rep ret
.size	_x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
___
# deckey_ref: append to $code a reference sequence that transforms, in
# place, the 32-bit word at displacement $i from register $ptr.
#
# Arguments (positional):
#   $i    displacement of the word
#   $ptr  base register, e.g. "%rsi"
#   $te   accepted but not used by this sub
#   $td   accepted but not used by this sub
#
# The emitted code uses %eax, %ebx, %edi, %edx and %r8d as temporaries,
# so $ptr must not be %rdi (its low half is overwritten before the final
# store).  tp2/tp4/tp8 are built by three rounds of the
# 0x80808080 / 0xfefefefe / 0x1b1b1b1b masking sequence, and the stored
# result is
#   tp2^tp4^tp8 ^ ROTATE(tp1^tp8,8) ^ ROTATE(tp2^tp1^tp8,24)
#               ^ ROTATE(tp4^tp1^tp8,16).
#
# No prototype is declared: the sub takes arguments from \@_, and an empty
# "()" prototype would make any call not written with '&' a compile error.
sub deckey_ref
{ my ($i,$ptr,$te,$td) = @_;
  my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
$code.=<<___;
	mov	$i($ptr),$tp1
	mov	$tp1,$acc
	and	\$0x80808080,$acc
	mov	$acc,$tp4
	shr	\$7,$tp4
	lea	0($tp1,$tp1),$tp2
	sub	$tp4,$acc
	and	\$0xfefefefe,$tp2
	and	\$0x1b1b1b1b,$acc
	xor	$tp2,$acc
	mov	$acc,$tp2

	and	\$0x80808080,$acc
	mov	$acc,$tp8
	shr	\$7,$tp8
	lea	0($tp2,$tp2),$tp4
	sub	$tp8,$acc
	and	\$0xfefefefe,$tp4
	and	\$0x1b1b1b1b,$acc
	 xor	$tp1,$tp2		# tp2^tp1
	xor	$tp4,$acc
	mov	$acc,$tp4

	and	\$0x80808080,$acc
	mov	$acc,$tp8
	shr	\$7,$tp8
	sub	$tp8,$acc
	lea	0($tp4,$tp4),$tp8
	 xor	$tp1,$tp4		# tp4^tp1
	and	\$0xfefefefe,$tp8
	and	\$0x1b1b1b1b,$acc
	xor	$acc,$tp8

	xor	$tp8,$tp1		# tp1^tp8
	rol	\$8,$tp1		# ROTATE(tp1^tp8,8)
	xor	$tp8,$tp2		# tp2^tp1^tp8
	xor	$tp8,$tp4		# tp4^tp1^tp8
	xor	$tp2,$tp8
	xor	$tp4,$tp8		# tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2

	xor	$tp8,$tp1
	rol	\$24,$tp2		# ROTATE(tp2^tp1^tp8,24)
	xor	$tp2,$tp1
	rol	\$16,$tp4		# ROTATE(tp4^tp1^tp8,16)
	xor	$tp4,$tp1

	mov	$tp1,$i($ptr)
___
}
# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Emitted routine, in order:
#   1. push callee-saved registers plus %rdx (the key schedule pointer);
#   2. call _x86_64_AES_set_encrypt_key and jump to .Labort, keeping its
#      return value, when %eax is non-zero;
#   3. .Linvert  - swap 16-byte round keys from both ends of the schedule
#      until the two pointers meet;
#   4. .Lpermute - starting at key+16, run the dectransform() code over
#      rounds-1 consecutive round keys and store the results back;
#   5. return 0.
$code.=<<___;
.globl	private_AES_set_decrypt_key
.type	private_AES_set_decrypt_key,\@function,3
.align	16
private_AES_set_decrypt_key:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rdx			# save key schedule
.Ldec_key_prologue:

	call	_x86_64_AES_set_encrypt_key
	mov	(%rsp),%r8		# restore key schedule
	cmp	\$0,%eax
	jne	.Labort

	mov	240(%r8),%r14d		# pull number of rounds
	xor	%rdi,%rdi
	lea	(%rdi,%r14d,4),%rcx
	mov	%r8,%rsi
	lea	(%r8,%rcx,4),%rdi	# pointer to last chunk
.align	4
.Linvert:
		mov	0(%rsi),%rax
		mov	8(%rsi),%rbx
		mov	0(%rdi),%rcx
		mov	8(%rdi),%rdx
		mov	%rax,0(%rdi)
		mov	%rbx,8(%rdi)
		mov	%rcx,0(%rsi)
		mov	%rdx,8(%rsi)
		lea	16(%rsi),%rsi
		lea	-16(%rdi),%rdi
		cmp	%rsi,%rdi
	jne	.Linvert

	lea	.LAES_Te+2048+1024(%rip),%rax	# rcon

	mov	40(%rax),$mask80
	mov	48(%rax),$maskfe
	mov	56(%rax),$mask1b

	mov	%r8,$key
	sub	\$1,%r14d
.align	4
.Lpermute:
		lea	16($key),$key
		mov	0($key),%rax
		mov	8($key),%rcx
___
		# '&' call form is kept on purpose: it bypasses any prototype
		# declared on the generator sub.
		&dectransform ();
$code.=<<___;
		mov	%eax,0($key)
		mov	%ebx,4($key)
		mov	%ecx,8($key)
		mov	%edx,12($key)
		sub	\$1,%r14d
	jnz	.Lpermute

	xor	%rax,%rax
.Labort:
	mov	8(%rsp),%r15
	mov	16(%rsp),%r14
	mov	24(%rsp),%r13
	mov	32(%rsp),%r12
	mov	40(%rsp),%rbp
	mov	48(%rsp),%rbx
	add	\$56,%rsp
.Ldec_key_epilogue:
	ret
.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
___
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const AES_KEY *key,
#			unsigned char *ivp,const int enc);
#
# Also exported as the hidden symbol asm_AES_cbc_encrypt.  Returns
# immediately when length is 0.  Otherwise one of two code paths runs:
#   fast path - taken when length is at least $speed_limit, is a multiple
#               of 16, and bit 28 of OPENSSL_ia32cap_P is clear; calls
#               _x86_64_AES_encrypt / _x86_64_AES_decrypt and may work
#               from a copy of the key schedule kept in the stack frame
#               (wiped again at .Lcbc_fast_cleanup);
#   slow path - everything else; calls the *_compact routines and handles
#               a trailing partial block.
# The lexicals below name the stack-frame slots and are interpolated into
# the assembly text, so they are scoped to this bare block.
{
# stack frame layout
# -8(%rsp)		return address
my $keyp="0(%rsp)";		# one to pass as $key
my $keyend="8(%rsp)";		# &(keyp->rd_key[4*keyp->rounds])
my $_rsp="16(%rsp)";		# saved %rsp
my $_inp="24(%rsp)";		# copy of 1st parameter, inp
my $_out="32(%rsp)";		# copy of 2nd parameter, out
my $_len="40(%rsp)";		# copy of 3rd parameter, length
my $_key="48(%rsp)";		# copy of 4th parameter, key
my $_ivp="56(%rsp)";		# copy of 5th parameter, ivp
my $ivec="64(%rsp)";		# ivec[16]
my $aes_key="80(%rsp)";		# copy of aes_key
my $mark="80+240(%rsp)";	# copy of aes_key->rounds

$code.=<<___;
.globl	AES_cbc_encrypt
.type	AES_cbc_encrypt,\@function,6
.align	16
.extern	OPENSSL_ia32cap_P
.globl	asm_AES_cbc_encrypt
.hidden	asm_AES_cbc_encrypt
asm_AES_cbc_encrypt:
AES_cbc_encrypt:
	cmp	\$0,%rdx	# check length
	je	.Lcbc_epilogue
	pushfq
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lcbc_prologue:

	cld
	mov	%r9d,%r9d	# clear upper half of enc

	lea	.LAES_Te(%rip),$sbox
	cmp	\$0,%r9
	jne	.Lcbc_picked_te
	lea	.LAES_Td(%rip),$sbox
.Lcbc_picked_te:

	mov	OPENSSL_ia32cap_P(%rip),%r10d
	cmp	\$$speed_limit,%rdx
	jb	.Lcbc_slow_prologue
	test	\$15,%rdx
	jnz	.Lcbc_slow_prologue
	bt	\$28,%r10d
	jc	.Lcbc_slow_prologue

	# allocate aligned stack frame...
	lea	-88-248(%rsp),$key
	and	\$-64,$key

	# ... and make sure it doesn't alias with AES_T[ed] modulo 4096
	mov	$sbox,%r10
	lea	2304($sbox),%r11
	mov	$key,%r12
	and	\$0xFFF,%r10	# s = $sbox&0xfff
	and	\$0xFFF,%r11	# e = ($sbox+2304)&0xfff
	and	\$0xFFF,%r12	# p = %rsp&0xfff

	cmp	%r11,%r12	# if (p>=e) %rsp -= (p-e);
	jb	.Lcbc_te_break_out
	sub	%r11,%r12
	sub	%r12,$key
	jmp	.Lcbc_te_ok
.Lcbc_te_break_out:		# else %rsp -= (p-s)&0xfff + framesz
	sub	%r10,%r12
	and	\$0xFFF,%r12
	add	\$320,%r12
	sub	%r12,$key
.align	4
.Lcbc_te_ok:

	xchg	%rsp,$key
	#add	\$8,%rsp	# reserve for return address!
	mov	$key,$_rsp	# save %rsp
.Lcbc_fast_body:
	mov	%rdi,$_inp	# save copy of inp
	mov	%rsi,$_out	# save copy of out
	mov	%rdx,$_len	# save copy of len
	mov	%rcx,$_key	# save copy of key
	mov	%r8,$_ivp	# save copy of ivp
	movl	\$0,$mark	# copy of aes_key->rounds = 0;
	mov	%r8,%rbp	# rearrange input arguments
	mov	%r9,%rbx
	mov	%rsi,$out
	mov	%rdi,$inp
	mov	%rcx,$key

	mov	240($key),%eax		# key->rounds
	# do we copy key schedule to stack?
	mov	$key,%r10
	sub	$sbox,%r10
	and	\$0xfff,%r10
	cmp	\$2304,%r10
	jb	.Lcbc_do_ecopy
	cmp	\$4096-248,%r10
	jb	.Lcbc_skip_ecopy
.align	4
.Lcbc_do_ecopy:
		mov	$key,%rsi
		lea	$aes_key,%rdi
		lea	$aes_key,$key
		mov	\$240/8,%ecx
		.long	0x90A548F3	# rep movsq
		mov	%eax,(%rdi)	# copy aes_key->rounds
.Lcbc_skip_ecopy:
	mov	$key,$keyp	# save key pointer

	mov	\$18,%ecx
.align	4
.Lcbc_prefetch_te:
		mov	0($sbox),%r10
		mov	32($sbox),%r11
		mov	64($sbox),%r12
		mov	96($sbox),%r13
		lea	128($sbox),$sbox
		sub	\$1,%ecx
	jnz	.Lcbc_prefetch_te
	lea	-2304($sbox),$sbox

	cmp	\$0,%rbx
	je	.LFAST_DECRYPT

#----------------------------- ENCRYPT -----------------------------#
	mov	0(%rbp),$s0		# load iv
	mov	4(%rbp),$s1
	mov	8(%rbp),$s2
	mov	12(%rbp),$s3

.align	4
.Lcbc_fast_enc_loop:
		xor	0($inp),$s0
		xor	4($inp),$s1
		xor	8($inp),$s2
		xor	12($inp),$s3
		mov	$keyp,$key	# restore key
		mov	$inp,$_inp	# if ($verticalspin) save inp

		call	_x86_64_AES_encrypt

		mov	$_inp,$inp	# if ($verticalspin) restore inp
		mov	$_len,%r10
		mov	$s0,0($out)
		mov	$s1,4($out)
		mov	$s2,8($out)
		mov	$s3,12($out)

		lea	16($inp),$inp
		lea	16($out),$out
		sub	\$16,%r10
		test	\$-16,%r10
		mov	%r10,$_len
	jnz	.Lcbc_fast_enc_loop
	mov	$_ivp,%rbp	# restore ivp
	mov	$s0,0(%rbp)	# save ivec
	mov	$s1,4(%rbp)
	mov	$s2,8(%rbp)
	mov	$s3,12(%rbp)

	jmp	.Lcbc_fast_cleanup

#----------------------------- DECRYPT -----------------------------#
.align	16
.LFAST_DECRYPT:
	cmp	$inp,$out
	je	.Lcbc_fast_dec_in_place

	mov	%rbp,$ivec
.align	4
.Lcbc_fast_dec_loop:
		mov	0($inp),$s0	# read input
		mov	4($inp),$s1
		mov	8($inp),$s2
		mov	12($inp),$s3
		mov	$keyp,$key	# restore key
		mov	$inp,$_inp	# if ($verticalspin) save inp

		call	_x86_64_AES_decrypt

		mov	$ivec,%rbp	# load ivp
		mov	$_inp,$inp	# if ($verticalspin) restore inp
		mov	$_len,%r10	# load len
		xor	0(%rbp),$s0	# xor iv
		xor	4(%rbp),$s1
		xor	8(%rbp),$s2
		xor	12(%rbp),$s3
		mov	$inp,%rbp	# current input, next iv

		sub	\$16,%r10
		mov	%r10,$_len	# update len
		mov	%rbp,$ivec	# update ivp

		mov	$s0,0($out)	# write output
		mov	$s1,4($out)
		mov	$s2,8($out)
		mov	$s3,12($out)

		lea	16($inp),$inp
		lea	16($out),$out
	jnz	.Lcbc_fast_dec_loop
	mov	$_ivp,%r12		# load user ivp
	mov	0(%rbp),%r10		# load iv
	mov	8(%rbp),%r11
	mov	%r10,0(%r12)		# copy back to user
	mov	%r11,8(%r12)
	jmp	.Lcbc_fast_cleanup

.align	16
.Lcbc_fast_dec_in_place:
	mov	0(%rbp),%r10		# copy iv to stack
	mov	8(%rbp),%r11
	mov	%r10,0+$ivec
	mov	%r11,8+$ivec
.align	4
.Lcbc_fast_dec_in_place_loop:
		mov	0($inp),$s0	# load input
		mov	4($inp),$s1
		mov	8($inp),$s2
		mov	12($inp),$s3
		mov	$keyp,$key	# restore key
		mov	$inp,$_inp	# if ($verticalspin) save inp

		call	_x86_64_AES_decrypt

		mov	$_inp,$inp	# if ($verticalspin) restore inp
		mov	$_len,%r10
		xor	0+$ivec,$s0
		xor	4+$ivec,$s1
		xor	8+$ivec,$s2
		xor	12+$ivec,$s3

		mov	0($inp),%r11	# load input
		mov	8($inp),%r12
		sub	\$16,%r10
		jz	.Lcbc_fast_dec_in_place_done

		mov	%r11,0+$ivec	# copy input to iv
		mov	%r12,8+$ivec

		mov	$s0,0($out)	# save output [zaps input]
		mov	$s1,4($out)
		mov	$s2,8($out)
		mov	$s3,12($out)

		lea	16($inp),$inp
		lea	16($out),$out
		mov	%r10,$_len
	jmp	.Lcbc_fast_dec_in_place_loop
.Lcbc_fast_dec_in_place_done:
	mov	$_ivp,%rdi
	mov	%r11,0(%rdi)	# copy iv back to user
	mov	%r12,8(%rdi)

	mov	$s0,0($out)	# save output [zaps input]
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

.align	4
.Lcbc_fast_cleanup:
	cmpl	\$0,$mark	# was the key schedule copied?
	lea	$aes_key,%rdi
	je	.Lcbc_exit
		mov	\$240/8,%ecx
		xor	%rax,%rax
		.long	0x90AB48F3	# rep stosq

	jmp	.Lcbc_exit

#--------------------------- SLOW ROUTINE ---------------------------#
.align	16
.Lcbc_slow_prologue:
	# allocate aligned stack frame...
	lea	-88(%rsp),%rbp
	and	\$-64,%rbp
	# ... just "above" key schedule
	lea	-88-63(%rcx),%r10
	sub	%rbp,%r10
	neg	%r10
	and	\$0x3c0,%r10
	sub	%r10,%rbp

	xchg	%rsp,%rbp
	#add	\$8,%rsp	# reserve for return address!
	mov	%rbp,$_rsp	# save %rsp
.Lcbc_slow_body:
	#mov	%rdi,$_inp	# save copy of inp
	#mov	%rsi,$_out	# save copy of out
	#mov	%rdx,$_len	# save copy of len
	#mov	%rcx,$_key	# save copy of key
	mov	%r8,$_ivp	# save copy of ivp
	mov	%r8,%rbp	# rearrange input arguments
	mov	%r9,%rbx
	mov	%rsi,$out
	mov	%rdi,$inp
	mov	%rcx,$key
	mov	%rdx,%r10

	mov	240($key),%eax
	mov	$key,$keyp	# save key pointer
	shl	\$4,%eax
	lea	($key,%rax),%rax
	mov	%rax,$keyend

	# pick Te4 copy which can't "overlap" with stack frame or key schedule
	lea	2048($sbox),$sbox
	lea	768-8(%rsp),%rax
	sub	$sbox,%rax
	and	\$0x300,%rax
	lea	($sbox,%rax),$sbox

	cmp	\$0,%rbx
	je	.LSLOW_DECRYPT

#--------------------------- SLOW ENCRYPT ---------------------------#
	test	\$-16,%r10		# check upon length
	mov	0(%rbp),$s0		# load iv
	mov	4(%rbp),$s1
	mov	8(%rbp),$s2
	mov	12(%rbp),$s3
	jz	.Lcbc_slow_enc_tail	# short input...

.align	4
.Lcbc_slow_enc_loop:
		xor	0($inp),$s0
		xor	4($inp),$s1
		xor	8($inp),$s2
		xor	12($inp),$s3
		mov	$keyp,$key	# restore key
		mov	$inp,$_inp	# save inp
		mov	$out,$_out	# save out
		mov	%r10,$_len	# save len

		call	_x86_64_AES_encrypt_compact

		mov	$_inp,$inp	# restore inp
		mov	$_out,$out	# restore out
		mov	$_len,%r10	# restore len
		mov	$s0,0($out)
		mov	$s1,4($out)
		mov	$s2,8($out)
		mov	$s3,12($out)

		lea	16($inp),$inp
		lea	16($out),$out
		sub	\$16,%r10
		test	\$-16,%r10
	jnz	.Lcbc_slow_enc_loop
	test	\$15,%r10
	jnz	.Lcbc_slow_enc_tail
	mov	$_ivp,%rbp	# restore ivp
	mov	$s0,0(%rbp)	# save ivec
	mov	$s1,4(%rbp)
	mov	$s2,8(%rbp)
	mov	$s3,12(%rbp)

	jmp	.Lcbc_exit

.align	4
.Lcbc_slow_enc_tail:
	mov	%rax,%r11
	mov	%rcx,%r12
	mov	%r10,%rcx
	mov	$inp,%rsi
	mov	$out,%rdi
	.long	0x9066A4F3		# rep movsb
	mov	\$16,%rcx		# zero tail
	sub	%r10,%rcx
	xor	%rax,%rax
	.long	0x9066AAF3		# rep stosb
	mov	$out,$inp		# this is not a mistake!
	mov	\$16,%r10		# len=16
	mov	%r11,%rax
	mov	%r12,%rcx
	jmp	.Lcbc_slow_enc_loop	# one more spin...
#--------------------------- SLOW DECRYPT ---------------------------#
.align	16
.LSLOW_DECRYPT:
	shr	\$3,%rax
	add	%rax,$sbox		# recall "magic" constants!

	mov	0(%rbp),%r11		# copy iv to stack
	mov	8(%rbp),%r12
	mov	%r11,0+$ivec
	mov	%r12,8+$ivec

.align	4
.Lcbc_slow_dec_loop:
		mov	0($inp),$s0	# load input
		mov	4($inp),$s1
		mov	8($inp),$s2
		mov	12($inp),$s3
		mov	$keyp,$key	# restore key
		mov	$inp,$_inp	# save inp
		mov	$out,$_out	# save out
		mov	%r10,$_len	# save len

		call	_x86_64_AES_decrypt_compact

		mov	$_inp,$inp	# restore inp
		mov	$_out,$out	# restore out
		mov	$_len,%r10
		xor	0+$ivec,$s0
		xor	4+$ivec,$s1
		xor	8+$ivec,$s2
		xor	12+$ivec,$s3

		mov	0($inp),%r11	# load input
		mov	8($inp),%r12
		sub	\$16,%r10
		jc	.Lcbc_slow_dec_partial
		jz	.Lcbc_slow_dec_done

		mov	%r11,0+$ivec	# copy input to iv
		mov	%r12,8+$ivec

		mov	$s0,0($out)	# save output [can zap input]
		mov	$s1,4($out)
		mov	$s2,8($out)
		mov	$s3,12($out)

		lea	16($inp),$inp
		lea	16($out),$out
	jmp	.Lcbc_slow_dec_loop
.Lcbc_slow_dec_done:
	mov	$_ivp,%rdi
	mov	%r11,0(%rdi)		# copy iv back to user
	mov	%r12,8(%rdi)

	mov	$s0,0($out)		# save output [can zap input]
	mov	$s1,4($out)
	mov	$s2,8($out)
	mov	$s3,12($out)

	jmp	.Lcbc_exit

.align	4
.Lcbc_slow_dec_partial:
	mov	$_ivp,%rdi
	mov	%r11,0(%rdi)		# copy iv back to user
	mov	%r12,8(%rdi)

	mov	$s0,0+$ivec		# save output to stack
	mov	$s1,4+$ivec
	mov	$s2,8+$ivec
	mov	$s3,12+$ivec

	mov	$out,%rdi
	lea	$ivec,%rsi
	lea	16(%r10),%rcx
	.long	0x9066A4F3	# rep movsb
	jmp	.Lcbc_exit

.align	16
.Lcbc_exit:
	mov	$_rsp,%rsi
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lcbc_popfq:
	popfq
.Lcbc_epilogue:
	ret
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
2113
2114
# Start of the encryption lookup table.  The heredoc below only emits the
# 64-byte alignment directive and the .LAES_Te label; the table contents come
# from the 64 &_data_word() calls that follow (4 words per call, 256 entries).
# Each entry packs one S-box output together with its MixColumns multiples,
# e.g. the first entry 0xa56363c6 is {03*63, 63, 63, 02*63} for S[0]=0x63.
# NOTE(review): _data_word is defined earlier in the file (not visible here);
# confirm how many bytes it emits per entry before computing offsets from
# .LAES_Te into the data that follows the table.
$code.=<<___;
2115
.align	64
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2116
.LAES_Te:
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
2117
___
2118
	&_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
2119
	&_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
2120
	&_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
2121
	&_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
2122
	&_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
2123
	&_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
2124
	&_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
2125
	&_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
2126
	&_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
2127
	&_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
2128
	&_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
2129
	&_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
2130
	&_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
2131
	&_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
2132
	&_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
2133
	&_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
2134
	&_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
2135
	&_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
2136
	&_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
2137
	&_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
2138
	&_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
2139
	&_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
2140
	&_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
2141
	&_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
2142
	&_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
2143
	&_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
2144
	&_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
2145
	&_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
2146
	&_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
2147
	&_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
2148
	&_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
2149
	&_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
2150
	&_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
2151
	&_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
2152
	&_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
2153
	&_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
2154
	&_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
2155
	&_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
2156
	&_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
2157
	&_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
2158
	&_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
2159
	&_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
2160
	&_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
2161
	&_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
2162
	&_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
2163
	&_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
2164
	&_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
2165
	&_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
2166
	&_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
2167
	&_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
2168
	&_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
2169
	&_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
2170
	&_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
2171
	&_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
2172
	&_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
2173
	&_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
2174
	&_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
2175
	&_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
2176
	&_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
2177
	&_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
2178
	&_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
2179
	&_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
2180
	&_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
2181
	&_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2182
2183
# Te4: the AES S-box as a plain byte table (256 bytes, written as 32 rows of
# 8 bytes).  The same 32 rows are emitted four times back to back, giving
# four copies of Te4 to choose from to avoid L1 aliasing.
# The copies carry no label of their own; they directly follow the
# .LAES_Te word table, so the four blocks must stay identical and in place.
2184
	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2185
	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2186
	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2187
	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2188
	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2189
	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2190
	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2191
	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2192
	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2193
	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2194
	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2195
	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2196
	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2197
	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2198
	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2199
	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2200
	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2201
	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2202
	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2203
	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2204
	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2205
	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2206
	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2207
	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2208
	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2209
	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2210
	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2211
	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2212
	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2213
	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2214
	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2215
	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2216
2217
	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2218
	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2219
	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2220
	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2221
	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2222
	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2223
	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2224
	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2225
	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2226
	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2227
	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2228
	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2229
	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2230
	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2231
	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2232
	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2233
	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2234
	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2235
	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2236
	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2237
	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2238
	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2239
	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2240
	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2241
	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2242
	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2243
	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2244
	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2245
	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2246
	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2247
	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2248
	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2249
2250
	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2251
	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2252
	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2253
	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2254
	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2255
	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2256
	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2257
	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2258
	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2259
	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2260
	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2261
	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2262
	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2263
	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2264
	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2265
	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2266
	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2267
	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2268
	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2269
	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2270
	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2271
	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2272
	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2273
	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2274
	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2275
	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2276
	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2277
	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2278
	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2279
	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2280
	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2281
	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2282
2283
	&data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2284
	&data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2285
	&data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2286
	&data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2287
	&data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2288
	&data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2289
	&data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2290
	&data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2291
	&data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2292
	&data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2293
	&data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2294
	&data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2295
	&data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2296
	&data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2297
	&data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2298
	&data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2299
	&data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2300
	&data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2301
	&data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2302
	&data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2303
	&data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2304
	&data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2305
	&data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2306
	&data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2307
	&data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2308
	&data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2309
	&data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2310
	&data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2311
	&data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2312
	&data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2313
	&data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2314
	&data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
2315
# rcon: the ten AES key-schedule round constants (0x01 .. 0x80, 0x1b, 0x36),
# one 32-bit word each, followed by three pairs of mask words
# (0x80808080, 0xfefefefe, 0x1b1b1b1b).
# No label is emitted here: the data sits immediately after the fourth Te4
# copy, so it is presumably addressed as an offset from .LAES_Te -- do not
# insert anything between the Te4 copies and this block.
# NOTE(review): the three masks look like the packed-byte constants for
# GF(2^8) doubling (high-bit select, post-shift mask, 0x1b reduction);
# confirm against the compact routines earlier in the file.
2316
$code.=<<___;
2317
	.long	0x00000001, 0x00000002, 0x00000004, 0x00000008
2318
	.long	0x00000010, 0x00000020, 0x00000040, 0x00000080
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2319
	.long	0x0000001b, 0x00000036, 0x80808080, 0x80808080
2320
	.long	0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
2321
___
2322
# Start of the decryption lookup table.  As with .LAES_Te, the heredoc only
# emits the 64-byte alignment directive and the .LAES_Td label; the 64
# &_data_word() calls that follow supply the 256 entries (4 words per call).
# Each entry packs the InvMixColumns multiples of one inverse S-box output,
# e.g. the first entry 0x50a7f451 is {0b, 0d, 09, 0e} times Si[0]=0x52.
# NOTE(review): _data_word is defined earlier in the file (not visible here);
# confirm its per-entry output size before computing offsets from .LAES_Td.
$code.=<<___;
2323
.align	64
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2324
.LAES_Td:
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
2325
___
2326
	&_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
2327
	&_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
2328
	&_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
2329
	&_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
2330
	&_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
2331
	&_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
2332
	&_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
2333
	&_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
2334
	&_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
2335
	&_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
2336
	&_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
2337
	&_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
2338
	&_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
2339
	&_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
2340
	&_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
2341
	&_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
2342
	&_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
2343
	&_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
2344
	&_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
2345
	&_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
2346
	&_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
2347
	&_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
2348
	&_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
2349
	&_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
2350
	&_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
2351
	&_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
2352
	&_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
2353
	&_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
2354
	&_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
2355
	&_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
2356
	&_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
2357
	&_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
2358
	&_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
2359
	&_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
2360
	&_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
2361
	&_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
2362
	&_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
2363
	&_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
2364
	&_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
2365
	&_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
2366
	&_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
2367
	&_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
2368
	&_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
2369
	&_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
2370
	&_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
2371
	&_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
2372
	&_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
2373
	&_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
2374
	&_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
2375
	&_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
2376
	&_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
2377
	&_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
2378
	&_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
2379
	&_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
2380
	&_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
2381
	&_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
2382
	&_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
2383
	&_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
2384
	&_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
2385
	&_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
2386
	&_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
2387
	&_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
2388
	&_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
2389
	&_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2390
2391
# Td4: the AES inverse S-box as a plain byte table (256 bytes, written as 32
# rows of 8 bytes), emitted four times so there are four copies of Td4 to
# choose from to avoid L1 aliasing.
# Unlike Te4, every Td4 copy is followed by eight .long words (the
# 0x80808080 / 0xfefefefe / 0x1b1b1b1b mask pairs plus two zero words), so
# consecutive copies are 256 + 32 bytes apart.  The copies carry no label of
# their own and directly follow the .LAES_Td word table.
2392
	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2393
	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2394
	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2395
	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2396
	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2397
	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2398
	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2399
	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2400
	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2401
	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2402
	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2403
	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2404
	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2405
	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2406
	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2407
	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2408
	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2409
	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2410
	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2411
	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2412
	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2413
	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2414
	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2415
	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2416
	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2417
	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2418
	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2419
	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2420
	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2421
	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2422
	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2423
	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2424
# Eight 32-bit words appended to the first Td4 copy: the mask pairs
# 0x80808080, 0xfefefefe and 0x1b1b1b1b, then two zero words.
# NOTE(review): presumably placed next to each Td4 copy so the decrypt code
# can reach masks and table from one base pointer -- confirm with the caller.
$code.=<<___;
2425
	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2426
	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2427
___
2428
	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2429
	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2430
	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2431
	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2432
	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2433
	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2434
	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2435
	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2436
	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2437
	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2438
	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2439
	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2440
	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2441
	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2442
	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2443
	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2444
	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2445
	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2446
	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2447
	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2448
	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2449
	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2450
	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2451
	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2452
	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2453
	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2454
	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2455
	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2456
	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2457
	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2458
	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2459
	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2460
# Eight 32-bit words appended to the second Td4 copy: the mask pairs
# 0x80808080, 0xfefefefe and 0x1b1b1b1b, then two zero words.  Must stay
# identical to the words that trail the other Td4 copies.
$code.=<<___;
2461
	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2462
	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2463
___
2464
	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2465
	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2466
	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2467
	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2468
	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2469
	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2470
	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2471
	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2472
	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2473
	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2474
	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2475
	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2476
	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2477
	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2478
	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2479
	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2480
	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2481
	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2482
	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2483
	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2484
	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2485
	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2486
	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2487
	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2488
	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2489
	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2490
	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2491
	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2492
	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2493
	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2494
	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2495
	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2496
# Eight 32-bit words appended to the third Td4 copy: the mask pairs
# 0x80808080, 0xfefefefe and 0x1b1b1b1b, then two zero words.  Must stay
# identical to the words that trail the other Td4 copies.
$code.=<<___;
2497
	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2498
	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2499
___
2500
	&data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2501
	&data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2502
	&data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2503
	&data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2504
	&data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2505
	&data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2506
	&data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2507
	&data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2508
	&data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2509
	&data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2510
	&data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2511
	&data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2512
	&data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2513
	&data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2514
	&data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2515
	&data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2516
	&data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2517
	&data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2518
	&data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2519
	&data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2520
	&data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2521
	&data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2522
	&data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2523
	&data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2524
	&data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2525
	&data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2526
	&data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2527
	&data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2528
	&data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2529
	&data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2530
	&data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2531
	&data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2532
# Eight 32-bit words appended to the fourth (last) Td4 copy: the mask pairs
# 0x80808080, 0xfefefefe and 0x1b1b1b1b, then two zero words.
# The same heredoc then closes the data area with the NUL-terminated
# identification string and pads to a 64-byte boundary.  The '@' in the
# e-mail address is backslash-escaped because the heredoc interpolates.
$code.=<<___;
2533
	.long	0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2534
	.long	0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2535
.asciz  "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2536
.align	64
2537
___
2538
2539
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2540
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2541
if ($win64) {
2542
# Symbolic names for the four handler arguments, in prototype order
# (rec, frame, context, disp).  They are bound to rcx, rdx, r8 and r9, the
# registers in which the Win64 calling convention passes the first four
# integer/pointer arguments, and are interpolated into the handler assembly
# emitted below.
$rec="%rcx";
2543
$frame="%rdx";
2544
$context="%r8";
2545
$disp="%r9";
2546
2547
$code.=<<___;
2548
.extern	__imp_RtlVirtualUnwind
2549
.type	block_se_handler,\@abi-omnipotent
2550
.align	16
2551
block_se_handler:
2552
	push	%rsi
2553
	push	%rdi
2554
	push	%rbx
2555
	push	%rbp
2556
	push	%r12
2557
	push	%r13
2558
	push	%r14
2559
	push	%r15
2560
	pushfq
2561
	sub	\$64,%rsp
2562
2563
	mov	120($context),%rax	# pull context->Rax
2564
	mov	248($context),%rbx	# pull context->Rip
2565
2566
	mov	8($disp),%rsi		# disp->ImageBase
2567
	mov	56($disp),%r11		# disp->HandlerData
2568
2569
	mov	0(%r11),%r10d		# HandlerData[0]
2570
	lea	(%rsi,%r10),%r10	# prologue label
2571
	cmp	%r10,%rbx		# context->Rip<prologue label
2572
	jb	.Lin_block_prologue
2573
2574
	mov	152($context),%rax	# pull context->Rsp
2575
2576
	mov	4(%r11),%r10d		# HandlerData[1]
2577
	lea	(%rsi,%r10),%r10	# epilogue label
2578
	cmp	%r10,%rbx		# context->Rip>=epilogue label
2579
	jae	.Lin_block_prologue
2580
2581
	mov	24(%rax),%rax		# pull saved real stack pointer
2582
	lea	48(%rax),%rax		# adjust...
2583
2584
	mov	-8(%rax),%rbx
2585
	mov	-16(%rax),%rbp
2586
	mov	-24(%rax),%r12
2587
	mov	-32(%rax),%r13
2588
	mov	-40(%rax),%r14
2589
	mov	-48(%rax),%r15
2590
	mov	%rbx,144($context)	# restore context->Rbx
2591
	mov	%rbp,160($context)	# restore context->Rbp
2592
	mov	%r12,216($context)	# restore context->R12
2593
	mov	%r13,224($context)	# restore context->R13
2594
	mov	%r14,232($context)	# restore context->R14
2595
	mov	%r15,240($context)	# restore context->R15
2596
2597
.Lin_block_prologue:
2598
	mov	8(%rax),%rdi
2599
	mov	16(%rax),%rsi
2600
	mov	%rax,152($context)	# restore context->Rsp
2601
	mov	%rsi,168($context)	# restore context->Rsi
2602
	mov	%rdi,176($context)	# restore context->Rdi
2603
2604
	jmp	.Lcommon_seh_exit
2605
.size	block_se_handler,.-block_se_handler
2606
2607
.type	key_se_handler,\@abi-omnipotent
2608
.align	16
2609
key_se_handler:
2610
	push	%rsi
2611
	push	%rdi
2612
	push	%rbx
2613
	push	%rbp
2614
	push	%r12
2615
	push	%r13
2616
	push	%r14
2617
	push	%r15
2618
	pushfq
2619
	sub	\$64,%rsp
2620
2621
	mov	120($context),%rax	# pull context->Rax
2622
	mov	248($context),%rbx	# pull context->Rip
2623
2624
	mov	8($disp),%rsi		# disp->ImageBase
2625
	mov	56($disp),%r11		# disp->HandlerData
2626
2627
	mov	0(%r11),%r10d		# HandlerData[0]
2628
	lea	(%rsi,%r10),%r10	# prologue label
2629
	cmp	%r10,%rbx		# context->Rip<prologue label
2630
	jb	.Lin_key_prologue
2631
2632
	mov	152($context),%rax	# pull context->Rsp
2633
2634
	mov	4(%r11),%r10d		# HandlerData[1]
2635
	lea	(%rsi,%r10),%r10	# epilogue label
2636
	cmp	%r10,%rbx		# context->Rip>=epilogue label
2637
	jae	.Lin_key_prologue
2638
2639
	lea	56(%rax),%rax
2640
2641
	mov	-8(%rax),%rbx
2642
	mov	-16(%rax),%rbp
2643
	mov	-24(%rax),%r12
2644
	mov	-32(%rax),%r13
2645
	mov	-40(%rax),%r14
2646
	mov	-48(%rax),%r15
2647
	mov	%rbx,144($context)	# restore context->Rbx
2648
	mov	%rbp,160($context)	# restore context->Rbp
2649
	mov	%r12,216($context)	# restore context->R12
2650
	mov	%r13,224($context)	# restore context->R13
2651
	mov	%r14,232($context)	# restore context->R14
2652
	mov	%r15,240($context)	# restore context->R15
2653
2654
.Lin_key_prologue:
2655
	mov	8(%rax),%rdi
2656
	mov	16(%rax),%rsi
2657
	mov	%rax,152($context)	# restore context->Rsp
2658
	mov	%rsi,168($context)	# restore context->Rsi
2659
	mov	%rdi,176($context)	# restore context->Rdi
2660
2661
	jmp	.Lcommon_seh_exit
2662
.size	key_se_handler,.-key_se_handler
2663
2664
# cbc_se_handler: Win64 structured-exception handler referenced by the
# .LSEH_info_AES_cbc_encrypt record in .xdata.  It inspects context->Rip to
# decide how far the guarded routine had progressed, writes the matching
# non-volatile register values and stack pointer back into the CONTEXT
# record, copies that record to disp->ContextRecord, calls RtlVirtualUnwind
# and returns ExceptionContinueSearch.
.type	cbc_se_handler,\@abi-omnipotent
2665
.align	16
2666
cbc_se_handler:
2667
# Preserve every register the handler touches plus the flags, then reserve
# 64 bytes of stack; offsets 32-56 of that area later carry arguments 5-8
# of the RtlVirtualUnwind call.
	push	%rsi
2668
	push	%rdi
2669
	push	%rbx
2670
	push	%rbp
2671
	push	%r12
2672
	push	%r13
2673
	push	%r14
2674
	push	%r15
2675
	pushfq
2676
	sub	\$64,%rsp
2677
2678
# %rax is the base address from which saved registers are reloaded below;
# %rbx holds the faulting instruction address used for classification.
# NOTE(review): the paths that keep context->Rax as the base presumably rely
# on the guarded routine holding its entry stack pointer in %rax - confirm
# against AES_cbc_encrypt.
	mov	120($context),%rax	# pull context->Rax
2679
	mov	248($context),%rbx	# pull context->Rip
2680
2681
# Below .Lcbc_prologue: only %rdi, %rsi and Rsp are fixed up.
	lea	.Lcbc_prologue(%rip),%r10
2682
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
2683
	jb	.Lin_cbc_prologue
2684
2685
# From .Lcbc_prologue up to .Lcbc_fast_body: reload the saved registers
# relative to context->Rax.
	lea	.Lcbc_fast_body(%rip),%r10
2686
	cmp	%r10,%rbx		# context->Rip<.Lcbc_fast_body
2687
	jb	.Lin_cbc_frame_setup
2688
2689
# From .Lcbc_fast_body up to .Lcbc_slow_prologue: handled as function body.
	lea	.Lcbc_slow_prologue(%rip),%r10
2690
	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_prologue
2691
	jb	.Lin_cbc_body
2692
2693
# From .Lcbc_slow_prologue up to .Lcbc_slow_body: same treatment as the
# first frame-setup window.  Anything at or past .Lcbc_slow_body falls
# through into .Lin_cbc_body.
	lea	.Lcbc_slow_body(%rip),%r10
2694
	cmp	%r10,%rbx		# context->Rip<.Lcbc_slow_body
2695
	jb	.Lin_cbc_frame_setup
2696
2697
.Lin_cbc_body:
2698
# In the body the base is context->Rsp instead of context->Rax.
	mov	152($context),%rax	# pull context->Rsp
2699
2700
# At or past .Lcbc_epilogue: skip the register reload entirely.
	lea	.Lcbc_epilogue(%rip),%r10
2701
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_epilogue
2702
	jae	.Lin_cbc_prologue
2703
2704
# Step over one 8-byte stack slot.
# NOTE(review): presumably the flags image that .Lcbc_popfq pops - confirm.
	lea	8(%rax),%rax
2705
2706
# At or past .Lcbc_popfq: skip the reload too, keeping the base advanced by 8.
	lea	.Lcbc_popfq(%rip),%r10
2707
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_popfq
2708
	jae	.Lin_cbc_prologue
2709
2710
# Load the stack pointer saved in the frame (frame offset 16, less the 8
# already added to the base), then move 56 bytes up so the negative offsets
# used by the reload below line up with the saved registers.
	mov	`16-8`(%rax),%rax	# biased $_rsp
2711
	lea	56(%rax),%rax
2712
2713
.Lin_cbc_frame_setup:
2714
# Reload %rbx, %rbp and %r12-%r15 from the six quadwords at -16 through -56
# relative to the base and publish them in the CONTEXT record.
	mov	-16(%rax),%rbx
2715
	mov	-24(%rax),%rbp
2716
	mov	-32(%rax),%r12
2717
	mov	-40(%rax),%r13
2718
	mov	-48(%rax),%r14
2719
	mov	-56(%rax),%r15
2720
	mov	%rbx,144($context)	# restore context->Rbx
2721
	mov	%rbp,160($context)	# restore context->Rbp
2722
	mov	%r12,216($context)	# restore context->R12
2723
	mov	%r13,224($context)	# restore context->R13
2724
	mov	%r14,232($context)	# restore context->R14
2725
	mov	%r15,240($context)	# restore context->R15
2726
2727
.Lin_cbc_prologue:
2728
# Recover %rdi and %rsi from the two quadwords just above the base and
# record the base itself as the restored stack pointer.
# NOTE(review): offsets 8 and 16 look like the slots where the Win64 entry
# sequence parks %rdi and %rsi - confirm against the generated prologue.
	mov	8(%rax),%rdi
2729
	mov	16(%rax),%rsi
2730
	mov	%rax,152($context)	# restore context->Rsp
2731
	mov	%rsi,168($context)	# restore context->Rsi
2732
	mov	%rdi,176($context)	# restore context->Rdi
2733
2734
.Lcommon_seh_exit:
2735
2736
# Shared tail, also entered by a jmp from key_se_handler.  Copy the patched
# CONTEXT (1232 bytes, i.e. 154 quadwords) over disp->ContextRecord.
	mov	40($disp),%rdi		# disp->ContextRecord
2737
	mov	$context,%rsi		# context
2738
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
2739
	.long	0xa548f3fc		# cld; rep movsq
2740
2741
# Marshal the RtlVirtualUnwind arguments: the first four in %rcx, %rdx, %r8
# and %r9, the remaining four in the stack area reserved on entry.
	mov	$disp,%rsi
2742
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2743
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2744
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2745
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2746
	mov	40(%rsi),%r10		# disp->ContextRecord
2747
	lea	56(%rsi),%r11		# &disp->HandlerData
2748
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2749
	mov	%r10,32(%rsp)		# arg5
2750
	mov	%r11,40(%rsp)		# arg6
2751
	mov	%r12,48(%rsp)		# arg7
2752
	mov	%rcx,56(%rsp)		# arg8, (NULL)
2753
	call	*__imp_RtlVirtualUnwind(%rip)
2754
2755
# Report ExceptionContinueSearch and undo the handler's own frame in the
# reverse of the push order used on entry.
	mov	\$1,%eax		# ExceptionContinueSearch
2756
	add	\$64,%rsp
2757
	popfq
2758
	pop	%r15
2759
	pop	%r14
2760
	pop	%r13
2761
	pop	%r12
2762
	pop	%rbp
2763
	pop	%rbx
2764
	pop	%rdi
2765
	pop	%rsi
2766
	ret
2767
.size	cbc_se_handler,.-cbc_se_handler
2768
2769
# .pdata: one record of three image-relative addresses per covered routine,
# in the order begin label, end label, unwind-info label.
.section	.pdata
2770
.align	4
2771
	.rva	.LSEH_begin_AES_encrypt
2772
	.rva	.LSEH_end_AES_encrypt
2773
	.rva	.LSEH_info_AES_encrypt
2774
2775
	.rva	.LSEH_begin_AES_decrypt
2776
	.rva	.LSEH_end_AES_decrypt
2777
	.rva	.LSEH_info_AES_decrypt
2778
1.2.6 by Kurt Roeckx
Import upstream version 1.0.1
2779
	.rva	.LSEH_begin_private_AES_set_encrypt_key
2780
	.rva	.LSEH_end_private_AES_set_encrypt_key
2781
	.rva	.LSEH_info_private_AES_set_encrypt_key
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2782
1.2.6 by Kurt Roeckx
Import upstream version 1.0.1
2783
	.rva	.LSEH_begin_private_AES_set_decrypt_key
2784
	.rva	.LSEH_end_private_AES_set_decrypt_key
2785
	.rva	.LSEH_info_private_AES_set_decrypt_key
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2786
2787
	.rva	.LSEH_begin_AES_cbc_encrypt
2788
	.rva	.LSEH_end_AES_cbc_encrypt
2789
	.rva	.LSEH_info_AES_cbc_encrypt
2790
2791
# .xdata: the unwind-info records named by the .pdata entries.  Each one is
# a 4-byte header, the image-relative address of the handler routine and,
# for all but the last record, two HandlerData label addresses.
# NOTE(review): header byte 9 presumably encodes UNWIND_INFO version 1 with
# UNW_FLAG_EHANDLER and the zero bytes mean no unwind codes - confirm
# against the Win64 unwind data documentation.
.section	.xdata
2792
.align	8
2793
.LSEH_info_AES_encrypt:
2794
	.byte	9,0,0,0
2795
	.rva	block_se_handler
2796
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
2797
.LSEH_info_AES_decrypt:
2798
	.byte	9,0,0,0
2799
	.rva	block_se_handler
2800
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
1.2.6 by Kurt Roeckx
Import upstream version 1.0.1
2801
# For the two key-schedule records, key_se_handler reads HandlerData[0] and
# HandlerData[1], adds disp->ImageBase to each and compares context->Rip
# against the resulting prologue and epilogue addresses.
.LSEH_info_private_AES_set_encrypt_key:
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2802
	.byte	9,0,0,0
2803
	.rva	key_se_handler
2804
	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
1.2.6 by Kurt Roeckx
Import upstream version 1.0.1
2805
.LSEH_info_private_AES_set_decrypt_key:
1.1.11 by Kurt Roeckx
Import upstream version 1.0.0c
2806
	.byte	9,0,0,0
2807
	.rva	key_se_handler
2808
	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
2809
# No HandlerData here: cbc_se_handler takes the addresses of its .Lcbc_*
# labels rip-relatively instead of reading them from this record.
.LSEH_info_AES_cbc_encrypt:
2810
	.byte	9,0,0,0
2811
	.rva	cbc_se_handler
2812
___
2813
}
1.1.7 by Kurt Roeckx
Import upstream version 0.9.8k
2814
2815
# Replace every backtick-quoted expression embedded in the generated
# assembly text (for example an offset written as 16-8 between backticks)
# with the value Perl computes for it.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Emit the finished assembly on STDOUT.
print $code;

# Buffered write errors only surface when the handle is closed, and when
# STDOUT is a pipe (presumably into the x86_64-xlate.pl translator located
# at the top of this script) close also reports a failing child process.
# Die here so a truncated or failed translation cannot pass for success.
close STDOUT or die "error closing STDOUT: $!";