~ubuntu-branches/ubuntu/maverick/openssl/maverick

« back to all changes in this revision

Viewing changes to crypto/aes/asm/aes-586.pl

  • Committer: Bazaar Package Importer
  • Author(s): Kurt Roeckx
  • Date: 2005-12-13 21:37:42 UTC
  • mto: (11.1.1 lenny)
  • mto: This revision was merged to the branch mainline in revision 4.
  • Revision ID: james.westby@ubuntu.com-20051213213742-d0ydaylf80l16bj1
Tags: upstream-0.9.8a
Import upstream version 0.9.8a

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/env perl
 
2
#
 
3
# ====================================================================
 
4
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 
5
# project. Rights for redistribution and usage in source and binary
 
6
# forms are granted according to the OpenSSL license.
 
7
# ====================================================================
 
8
#
 
9
# Version 3.4.
 
10
#
 
11
# You might fail to appreciate this module performance from the first
 
12
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
 
13
# to be *the* best Intel C compiler without -KPIC, performance appears
 
14
# to be virtually identical... But try to re-configure with shared
 
15
# library support... Aha! Intel compiler "suddenly" lags behind by 30%
 
16
# [on P4, more on others]:-) And if compared to position-independent
 
17
# code generated by GNU C, this code performs *more* than *twice* as
 
18
# fast! Yes, all this buzz about PIC means that unlike other hand-
 
19
# coded implementations, this one was explicitly designed to be safe
 
20
# to use even in shared library context... This also means that this
 
21
# code isn't necessarily absolutely fastest "ever," because in order
 
22
# to achieve position independence an extra register has to be
 
23
# off-loaded to stack, which affects the benchmark result.
 
24
#
 
25
# Special note about instruction choice. Do you recall RC4_INT code
 
26
# performing poorly on P4? It might be the time to figure out why.
 
27
# RC4_INT code implies effective address calculations in base+offset*4
 
28
# form. Trouble is that it seems that offset scaling turned to be
 
29
# critical path... At least eliminating scaling resulted in 2.8x RC4
 
30
# performance improvement [as you might recall]. As AES code is hungry
 
31
# for scaling too, I [try to] avoid the latter by favoring off-by-2
 
32
# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
 
33
#
 
34
# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
 
35
# void. Performance improvement with off-by-2 shifts was observed on
 
36
# intermediate implementation, which was spilling yet another register
 
37
# to stack... Final offset*4 code below runs just a tad faster on P4,
 
38
# but exhibits up to 10% improvement on other cores.
 
39
#
 
40
# Second version is "monolithic" replacement for aes_core.c, which in
 
41
# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
 
42
# This made it possible to implement little-endian variant of the
 
43
# algorithm without modifying the base C code. Motivating factor for
 
44
# the undertaken effort was that it appeared that in tight IA-32
 
45
# register window little-endian flavor could achieve slightly higher
 
46
# Instruction Level Parallelism, and it indeed resulted in up to 15%
 
47
# better performance on most recent µ-archs...
 
48
#
 
49
# Third version adds AES_cbc_encrypt implementation, which resulted in
 
50
# up to 40% performance improvement of CBC benchmark results. 40% was
 
51
# observed on P4 core, where "overall" improvement coefficient, i.e. if
 
52
# compared to PIC generated by GCC and in CBC mode, was observed to be
 
53
# as large as 4x:-) CBC performance is virtually identical to ECB now
 
54
# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
 
55
# Opteron, because certain function prologues and epilogues are
 
56
# effectively taken out of the loop...
 
57
#
 
58
# Version 3.2 implements compressed tables and prefetch of these tables
 
59
# in CBC[!] mode. Former means that 3/4 of table references are now
 
60
# misaligned, which unfortunately has negative impact on elder IA-32
 
61
# implementations, Pentium suffered 30% penalty, PIII - 10%.
 
62
#
 
63
# Version 3.3 avoids L1 cache aliasing between stack frame and
 
64
# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
 
65
# latter is achieved by copying the key schedule to controlled place in
 
66
# stack. This unfortunately has rather strong impact on small block CBC
 
67
# performance, ~2x deterioration on 16-byte block if compared to 3.3.
 
68
#
 
69
# Current ECB performance numbers for 128-bit key in CPU cycles per
 
70
# processed byte [measure commonly used by AES benchmarkers] are:
 
71
#
 
72
#               small footprint         fully unrolled
 
73
# P4            24                      22
 
74
# AMD K8        20                      19
 
75
# PIII          25                      23
 
76
# Pentium       81                      78
 
77
 
 
78
# Make the perlasm helper directories searchable and load the x86
# assembler-generation front end.
push @INC, "perlasm", "../../perlasm";
require "x86asm.pl";

# The first command-line argument is handed to asm_init() as is; the
# third asm_init() argument is true only when the last command-line
# argument is the literal string "386".
&asm_init($ARGV[0], "aes-586.pl", $ARGV[-1] eq "386");

# Register assignment: four state words plus the key pointer and an
# accumulator/scratch register.
($s0, $s1, $s2, $s3) = ("eax", "ebx", "ecx", "edx");
($key, $acc)         = ("edi", "esi");

# $compromise=128 abstains from copying the key schedule to the stack
# when encrypting inputs shorter than 128 bytes, at the cost of risking
# aliasing with the S-boxes. In return you get way better, up to +70%,
# small block performance.
$compromise = 0;

# $small_footprint=1 code is ~5% slower [on recent µ-archs], but ~5
# times smaller! I favor compact code to minimize cache contention and
# in hope to "collect" 5% back in real-life applications...
$small_footprint = 1;

# Shifting "vertically" defaults to 0 because of its proof-of-concept
# status...
$vertical_spin = 0;
 
103
 
 
104
# Note that there is no decvert(), as well as last encryption round is
 
105
# performed with "horizontal" shifts. This is because this "vertical"
 
106
# implementation [one which groups shifts on a given $s[i] to form a
 
107
# "column," unlike "horizontal" one, which groups shifts on different
 
108
# $s[i] to form a "row"] is work in progress. It was observed to run
 
109
# few percents faster on Intel cores, but not AMD. On AMD K8 core it's
 
110
# whole 12% slower:-( So we face a trade-off... Shall it be resolved
 
111
# some day? Till then the code is considered experimental and by
 
112
# default remains dormant...
 
113
 
 
114
# encvert($te,@s) - emit the table-lookup part of one encryption round
# in "vertical" order: all four bytes of one state word are looked up
# before the next word is touched.
#
#	$te	register holding the lookup-table base
#	@s	the four state registers; on exit they hold the new state
#
# $acc and $key serve as scratch registers. s2 and s1 are spilled to
# 4(%esp) and 8(%esp), and $key is reloaded from 12(%esp) before the
# last instruction, so the caller must have stored the key pointer
# there and must provide the stack frame.
sub encvert()
{ my ($te,@s) = @_;
  # List form is required here: "my $v0 = $acc, $v1 = $key;" would
  # declare only $v0 and leave $v1 as an undeclared package global.
  my ($v0,$v1) = ($acc,$key);

	&mov	($v0,$s[3]);				# copy s3
	&mov	(&DWP(4,"esp"),$s[2]);			# save s2
	&mov	($v1,$s[0]);				# copy s0
	&mov	(&DWP(8,"esp"),$s[1]);			# save s1

	# word s0: these four lookups initialise the output registers
	&movz	($s[2],&HB($s[0]));
	&and	($s[0],0xFF);
	&mov	($s[0],&DWP(0,$te,$s[0],8));		# s0>>0
	&shr	($v1,16);
	&mov	($s[3],&DWP(3,$te,$s[2],8));		# s0>>8
	&movz	($s[1],&HB($v1));
	&and	($v1,0xFF);
	&mov	($s[2],&DWP(2,$te,$v1,8));		# s0>>16
	 &mov	($v1,$v0);
	&mov	($s[1],&DWP(1,$te,$s[1],8));		# s0>>24

	# word s3: from here on lookups are xor-ed into the outputs
	&and	($v0,0xFF);
	&xor	($s[3],&DWP(0,$te,$v0,8));		# s3>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[2],&DWP(3,$te,$v0,8));		# s3>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[1],&DWP(2,$te,$v1,8));		# s3>>16
	 &mov	($v1,&DWP(4,"esp"));			# restore s2
	&xor	($s[0],&DWP(1,$te,$v0,8));		# s3>>24

	# word s2
	&mov	($v0,$v1);
	&and	($v1,0xFF);
	&xor	($s[2],&DWP(0,$te,$v1,8));		# s2>>0
	&movz	($v1,&HB($v0));
	&shr	($v0,16);
	&xor	($s[1],&DWP(3,$te,$v1,8));		# s2>>8
	&movz	($v1,&HB($v0));
	&and	($v0,0xFF);
	&xor	($s[0],&DWP(2,$te,$v0,8));		# s2>>16
	 &mov	($v0,&DWP(8,"esp"));			# restore s1
	&xor	($s[3],&DWP(1,$te,$v1,8));		# s2>>24

	# word s1
	&mov	($v1,$v0);
	&and	($v0,0xFF);
	&xor	($s[1],&DWP(0,$te,$v0,8));		# s1>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[0],&DWP(3,$te,$v0,8));		# s1>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[3],&DWP(2,$te,$v1,8));		# s1>>16
	 &mov	($key,&DWP(12,"esp"));			# reincarnate v1 as key
	&xor	($s[2],&DWP(1,$te,$v0,8));		# s1>>24
}
 
169
 
 
170
# encstep($i,$te,@s) - emit the table-lookup part of one "horizontal"
# round for output word $i (0..3).
#
#	$te	register holding the lookup-table base
#	@s	the four state registers, rotated by the caller so that
#		$s[0] supplies bits 0-7, $s[1] bits 8-15, $s[2] bits
#		16-23 and $s[3] bits 24-31 of the table indices
#
# Steps 0 and 1 park their result at 4(%esp) and 8(%esp), step 2 leaves
# it in $acc. Step 3 works in place in $s[0], reloads $key from
# 12(%esp), pulls the parked words back into $s[1]/$s[2] and copies
# $acc into $s[3]. Some shifts and masks needed by a later step are
# emitted early by steps 1 and 2 ("reordered" instructions), so the
# four steps only make sense when emitted in sequence.
sub encstep()
{ my ($i,$te,@s) = @_;
  my ($w0,$w1,$w2,$w3) = @s;
  my $last = ($i==3);
  my $idx  = $key;			# scratch register for table indices
  my $res  = $last ? $w0 : $acc;	# where this step's word is built

	# bits 0-7 of s[0]; in step 3 the mask was already applied by step 2
	if ($last) {
		&mov	($key,&DWP(12,"esp"));
	} else {
		&mov	($res,$w0);
		&and	($res,0xFF);
	}
	&shr	($w0,16)	if ($i==1);	# pre-shift for a later step
	&shr	($w0,24)	if ($i==2);	# pre-shift for a later step
	&mov	($res,&DWP(0,$te,$res,8));

	# bits 8-15 of s[1]
	$idx=$w1		if ($last);
	&movz	($idx,&HB($w1));
	&xor	($res,&DWP(3,$te,$idx,8));

	# bits 16-23 of s[2]; in step 3 the register is already shifted
	if ($last) {
		$idx=$w2;
		&mov	($w1,&DWP(4,"esp"));
	} else {
		&mov	($idx,$w2);
		&shr	($idx,16);
	}
	&and	($w1,0xFF)	if ($i==2);	# pre-mask for step 3
	&and	($idx,0xFF);
	&xor	($res,&DWP(2,$te,$idx,8));

	# bits 24-31 of s[3]; steps 2 and 3 see a pre-shifted register
	if ($last) {
		$idx=$w3;
		&mov	($w2,&DWP(8,"esp"));
	} elsif ($i==2) {
		&movz	($idx,&HB($w3));
	} else {
		&mov	($idx,$w3);
		&shr	($idx,24);
	}
	&xor	($res,&DWP(1,$te,$idx,8));

	&mov	(&DWP(4+4*$i,"esp"),$res)	if ($i<2);
	&mov	($w3,$acc)			if ($last);
	&comment();
}
 
203
 
 
204
# enclast($i,$te,@s) - emit the table-lookup part of the final
# encryption round for output word $i (0..3).
#
#	$te	register holding the lookup-table base
#	@s	the four state registers, rotated by the caller exactly as
#		for encstep()
#
# Unlike encstep(), every table word is masked down to a single byte
# lane (0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000) before being
# combined into the output word. Register and stack usage matches
# encstep(): steps 0 and 1 store to 4(%esp)/8(%esp), step 2 leaves its
# result in $acc, and step 3 reloads $key from 12(%esp) and gathers the
# other three words back into the state registers.
sub enclast()
{ my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# byte lane 0, indexed by bits 0-7 of s[0]
	if ($i==3)  {	&mov	($key,&DWP(12,"esp"));		}##%edx
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
	if ($i==1)  {	&shr	($s[0],16);			}#%ebx[1]
	if ($i==2)  {	&shr	($s[0],24);			}#%ecx[2]
			&mov	($out,&DWP(2,$te,$out,8));
			&and	($out,0x000000ff);

	# byte lane 1, indexed by bits 8-15 of s[1]
	if ($i==3)  {	$tmp=$s[1];				}##%eax
			&movz	($tmp,&HB($s[1]));
			&mov	($tmp,&DWP(0,$te,$tmp,8));
			&and	($tmp,0x0000ff00);
			&xor	($out,$tmp);

	# byte lane 2, indexed by bits 16-23 of s[2]
	# (the emitter is called with the '&' sigil like everywhere else)
	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],&DWP(4,"esp"));	}##%ebx
	else        {	&mov	($tmp,$s[2]);
			&shr	($tmp,16);			}
	if ($i==2)  {	&and	($s[1],0xFF);			}#%edx[2]
			&and	($tmp,0xFF);
			&mov	($tmp,&DWP(0,$te,$tmp,8));
			&and	($tmp,0x00ff0000);
			&xor	($out,$tmp);

	# byte lane 3, indexed by bits 24-31 of s[3]
	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));	}##%ecx
	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
	else        {	&mov	($tmp,$s[3]);
			&shr	($tmp,24);			}
			&mov	($tmp,&DWP(2,$te,$tmp,8));
			&and	($tmp,0xff000000);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],$acc);			}
}
 
242
 
 
243
# _data_word(@words) - hand every argument to data_word() twice in a
# row, i.e. data_word($w,$w) per word. Processing stops at the first
# undefined argument.
sub _data_word()
{	my @words = @_;

	foreach my $word (@words) {
		last if (!defined($word));
		&data_word($word,$word);
	}
}
 
244
 
 
245
&public_label("AES_Te");
&function_begin_B("_x86_AES_encrypt");
{
	# Emit the table-lookup part of one full round in whichever
	# flavour ("vertical" or "horizontal") is configured.
	my $emit_round = sub {
		if ($vertical_spin) {
			&encvert("ebp",$s0,$s1,$s2,$s3);
			return;
		}
		&encstep(0,"ebp",$s0,$s1,$s2,$s3);
		&encstep(1,"ebp",$s1,$s2,$s3,$s0);
		&encstep(2,"ebp",$s2,$s3,$s0,$s1);
		&encstep(3,"ebp",$s3,$s0,$s1,$s2);
	};

	# xor the four key words found at $disp($key) into the state.
	my $emit_add_key = sub {
		my ($disp) = @_;
		&xor	($s0,&DWP($disp+0,$key));
		&xor	($s1,&DWP($disp+4,$key));
		&xor	($s2,&DWP($disp+8,$key));
		&xor	($s3,&DWP($disp+12,$key));
	};

	if ($vertical_spin) {
		# I need high parts of volatile registers to be accessible...
		&exch	($s1="edi",$key="ebx");
		&mov	($s2="esi",$acc="ecx");
	}

	# note that caller is expected to allocate stack frame for me!
	&mov	(&DWP(12,"esp"),$key);		# save key

	$emit_add_key->(0);			# xor with key

	&mov	($acc,&DWP(240,$key));		# load key->rounds

	if ($small_footprint) {
		# looped flavour: 16($esp) holds the key address at which
		# the loop stops, i.e. $key+16*rounds-16
		&lea	($acc,&DWP(-2,$acc,$acc));
		&lea	($acc,&DWP(0,$key,$acc,8));
		&mov	(&DWP(16,"esp"),$acc);	# end of key schedule
		&align	(4);
		&set_label("loop");
			$emit_round->();
			&add	($key,16);	# advance rd_key
			$emit_add_key->(0);
		&cmp	($key,&DWP(16,"esp"));
		&mov	(&DWP(12,"esp"),$key);
		&jb	(&label("loop"));
	}
	else {
		# unrolled flavour: enter at the label matching key->rounds
		# and fall through the remaining sections
		&cmp	($acc,10);
		&jle	(&label("10rounds"));
		&cmp	($acc,12);
		&jle	(&label("12rounds"));

		foreach my $section (["14rounds",3],["12rounds",3],["10rounds",10]) {
			my ($entry,$limit) = @$section;

			&set_label($entry);
			# $i is deliberately the same package variable the
			# loops have always used
			for ($i=1;$i<$limit;$i++) {
				$emit_round->();
				$emit_add_key->(16*$i);
			}
			if ($entry ne "10rounds") {
				&add	($key,32);
				&mov	(&DWP(12,"esp"),$key);	# advance rd_key
			}
		}
	}

	if ($vertical_spin) {
		# "reincarnate" some registers for "horizontal" spin...
		&mov	($s1="ebx",$key="edi");
		&mov	($s2="ecx",$acc="esi");
	}

	# the last round is always emitted in "horizontal" form
	&enclast(0,"ebp",$s0,$s1,$s2,$s3);
	&enclast(1,"ebp",$s1,$s2,$s3,$s0);
	&enclast(2,"ebp",$s2,$s3,$s0,$s1);
	&enclast(3,"ebp",$s3,$s0,$s1,$s2);

	# final key addition; the looped flavour has advanced $key all
	# along, the unrolled 10-round tail has not
	&add	($key,$small_footprint?16:160);
	$emit_add_key->(0);

	&ret	();
}
 
360
 
 
361
&set_label("AES_Te",64);        # Yes! I keep it in the code segment!
 
362
        &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
 
363
        &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
 
364
        &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
 
365
        &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
 
366
        &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
 
367
        &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
 
368
        &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
 
369
        &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
 
370
        &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
 
371
        &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
 
372
        &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
 
373
        &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
 
374
        &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
 
375
        &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
 
376
        &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
 
377
        &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
 
378
        &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
 
379
        &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
 
380
        &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
 
381
        &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
 
382
        &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
 
383
        &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
 
384
        &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
 
385
        &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
 
386
        &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
 
387
        &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
 
388
        &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
 
389
        &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
 
390
        &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
 
391
        &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
 
392
        &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
 
393
        &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
 
394
        &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
 
395
        &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
 
396
        &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
 
397
        &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
 
398
        &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
 
399
        &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
 
400
        &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
 
401
        &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
 
402
        &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
 
403
        &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
 
404
        &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
 
405
        &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
 
406
        &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
 
407
        &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
 
408
        &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
 
409
        &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
 
410
        &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
 
411
        &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
 
412
        &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
 
413
        &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
 
414
        &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
 
415
        &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
 
416
        &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
 
417
        &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
 
418
        &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
 
419
        &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
 
420
        &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
 
421
        &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
 
422
        &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
 
423
        &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
 
424
        &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
 
425
        &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
 
426
#rcon:
 
427
        &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
 
428
        &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
 
429
        &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0);
 
430
&function_end_B("_x86_AES_encrypt");
 
431
 
 
432
# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
 
433
&public_label("AES_Te");
&function_begin("AES_encrypt");
{
	my @state = ($s0,$s1,$s2,$s3);

	&mov	($acc,&wparam(0));		# load inp
	&mov	($key,&wparam(2));		# load key

	# Build the frame _x86_AES_encrypt expects: reserve 24 bytes,
	# round %esp down to a multiple of 64, step up by 4 and keep the
	# caller's %esp at 16(%esp).
	&mov	($s0,"esp");
	&sub	("esp",24);
	&and	("esp",-64);
	&add	("esp",4);
	&mov	(&DWP(16,"esp"),$s0);

	# make it PIC: fetch the address of pic_point with call/pop and
	# turn it into the address of AES_Te
	&call	(&label("pic_point"));
	&set_label("pic_point");
	&blindpop("ebp");
	my $te_delta = &label("AES_Te")."-".&label("pic_point");
	&lea	("ebp",&DWP($te_delta,"ebp"));

	# load input data
	foreach my $n (0..3) {
		&mov	($state[$n],&DWP(4*$n,$acc));
	}

	&call	("_x86_AES_encrypt");

	# switch back to the caller's stack pointer
	&mov	("esp",&DWP(16,"esp"));

	&mov	($acc,&wparam(1));		# load out
	# write output data
	foreach my $n (0..3) {
		&mov	(&DWP(4*$n,$acc),$state[$n]);
	}
}
&function_end("AES_encrypt");
 
464
 
 
465
#------------------------------------------------------------------#
 
466
 
 
467
# decstep($i,$td,@s) - emit the table-lookup part of one decryption
# round for output word $i (0..3).
#
#	$td	register holding the lookup-table base
#	@s	the four state registers, rotated by the caller so that
#		$s[0] supplies bits 0-7, $s[1] bits 8-15, $s[2] bits
#		16-23 and $s[3] bits 24-31 of the table indices
#
# Steps 0 and 1 park their result at 4(%esp) and 8(%esp), step 2 leaves
# it in $acc. Step 3 works in place in $s[0], reloads $key from
# 12(%esp) and gathers the other words: $acc into $s[1], 8(%esp) into
# $s[2] and 4(%esp) into $s[3].
sub decstep()
{ my ($i,$td,@s) = @_;
  my ($w0,$w1,$w2,$w3) = @s;
  my $last = ($i==3);
  my $idx  = $key;			# scratch register for table indices
  my $res  = $last ? $w0 : $acc;	# where this step's word is built

	# no instructions are reordered, as performance appears
	# optimal... or rather that all attempts to reorder didn't
	# result in better performance [which by the way is not a
	# bit lower than encryption].

	# bits 0-7 of s[0]
	if ($last) {
		&mov	($key,&DWP(12,"esp"));
	} else {
		&mov	($res,$w0);
	}
	&and	($res,0xFF);
	&mov	($res,&DWP(0,$td,$res,8));

	# bits 8-15 of s[1]
	$idx=$w1		if ($last);
	&movz	($idx,&HB($w1));
	&xor	($res,&DWP(3,$td,$idx,8));

	# bits 16-23 of s[2]
	if ($last) {
		$idx=$w2;
		&mov	($w1,$acc);
	} else {
		&mov	($idx,$w2);
	}
	&shr	($idx,16);
	&and	($idx,0xFF);
	&xor	($res,&DWP(2,$td,$idx,8));

	# bits 24-31 of s[3]
	if ($last) {
		$idx=$w3;
		&mov	($w2,&DWP(8,"esp"));
	} else {
		&mov	($idx,$w3);
	}
	&shr	($idx,24);
	&xor	($res,&DWP(1,$td,$idx,8));

	&mov	(&DWP(4+4*$i,"esp"),$res)	if ($i<2);
	&mov	($w3,&DWP(4,"esp"))		if ($last);
	&comment();
}
 
499
 
 
500
# declast($i,$td,@s) - emit the table-lookup part of the final
# decryption round for output word $i (0..3).
#
#	$td	register holding the lookup-table base
#	@s	the four state registers, rotated by the caller exactly as
#		for decstep()
#
# All lookups go to the 4-byte-strided table that starts 2048 bytes
# past $td, and every fetched word is masked down to one byte lane
# (0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000) before being
# combined. Register and stack usage matches decstep(): steps 0 and 1
# store to 4(%esp)/8(%esp), step 2 leaves its result in $acc, and step 3
# reloads $key from 12(%esp) and gathers the other three words back
# into the state registers.
sub declast()
{ my ($i,$td,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# byte lane 0, indexed by bits 0-7 of s[0]
	if($i==3)   {	&mov	($key,&DWP(12,"esp"));		}
	else        {	&mov	($out,$s[0]);			}
			&and	($out,0xFF);
			&mov	($out,&DWP(2048,$td,$out,4));
			&and	($out,0x000000ff);

	# byte lane 1, indexed by bits 8-15 of s[1]
	if ($i==3)  {	$tmp=$s[1];				}
			&movz	($tmp,&HB($s[1]));
			&mov	($tmp,&DWP(2048,$td,$tmp,4));
			&and	($tmp,0x0000ff00);
			&xor	($out,$tmp);

	# byte lane 2, indexed by bits 16-23 of s[2]
	# (the emitter is called with the '&' sigil like everywhere else)
	if ($i==3)  {	$tmp=$s[2]; &mov ($s[1],$acc);		}
	else        {	&mov	($tmp,$s[2]);			}
			&shr	($tmp,16);
			&and	($tmp,0xFF);
			&mov	($tmp,&DWP(2048,$td,$tmp,4));
			&and	($tmp,0x00ff0000);
			&xor	($out,$tmp);

	# byte lane 3, indexed by bits 24-31 of s[3]
	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],&DWP(8,"esp"));	}
	else        {	&mov	($tmp,$s[3]);			}
			&shr	($tmp,24);
			&mov	($tmp,&DWP(2048,$td,$tmp,4));
			&and	($tmp,0xff000000);
			&xor	($out,$tmp);
	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
	if ($i==3)  {	&mov	($s[3],&DWP(4,"esp"));		}
}
 
534
 
 
535
&public_label("AES_Td");
 
536
&function_begin_B("_x86_AES_decrypt");
 
537
        # note that caller is expected to allocate stack frame for me!
 
538
        &mov    (&DWP(12,"esp"),$key);          # save key
 
539
 
 
540
        &xor    ($s0,&DWP(0,$key));             # xor with key
 
541
        &xor    ($s1,&DWP(4,$key));
 
542
        &xor    ($s2,&DWP(8,$key));
 
543
        &xor    ($s3,&DWP(12,$key));
 
544
 
 
545
        &mov    ($acc,&DWP(240,$key));          # load key->rounds
 
546
 
 
547
        if ($small_footprint) {
 
548
            &lea        ($acc,&DWP(-2,$acc,$acc));
 
549
            &lea        ($acc,&DWP(0,$key,$acc,8));
 
550
            &mov        (&DWP(16,"esp"),$acc);  # end of key schedule
 
551
            &align      (4);
 
552
            &set_label("loop");
 
553
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
 
554
                &decstep(1,"ebp",$s1,$s0,$s3,$s2);
 
555
                &decstep(2,"ebp",$s2,$s1,$s0,$s3);
 
556
                &decstep(3,"ebp",$s3,$s2,$s1,$s0);
 
557
                &add    ($key,16);              # advance rd_key
 
558
                &xor    ($s0,&DWP(0,$key));
 
559
                &xor    ($s1,&DWP(4,$key));
 
560
                &xor    ($s2,&DWP(8,$key));
 
561
                &xor    ($s3,&DWP(12,$key));
 
562
            &cmp        ($key,&DWP(16,"esp"));
 
563
            &mov        (&DWP(12,"esp"),$key);
 
564
            &jb         (&label("loop"));
 
565
        }
 
566
        else {
 
567
            &cmp        ($acc,10);
 
568
            &jle        (&label("10rounds"));
 
569
            &cmp        ($acc,12);
 
570
            &jle        (&label("12rounds"));
 
571
 
 
572
        &set_label("14rounds");
 
573
            for ($i=1;$i<3;$i++) {
 
574
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
 
575
                &decstep(1,"ebp",$s1,$s0,$s3,$s2);
 
576
                &decstep(2,"ebp",$s2,$s1,$s0,$s3);
 
577
                &decstep(3,"ebp",$s3,$s2,$s1,$s0);
 
578
                &xor    ($s0,&DWP(16*$i+0,$key));
 
579
                &xor    ($s1,&DWP(16*$i+4,$key));
 
580
                &xor    ($s2,&DWP(16*$i+8,$key));
 
581
                &xor    ($s3,&DWP(16*$i+12,$key));
 
582
            }
 
583
            &add        ($key,32);
 
584
            &mov        (&DWP(12,"esp"),$key);  # advance rd_key
 
585
        &set_label("12rounds");
 
586
            for ($i=1;$i<3;$i++) {
 
587
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
 
588
                &decstep(1,"ebp",$s1,$s0,$s3,$s2);
 
589
                &decstep(2,"ebp",$s2,$s1,$s0,$s3);
 
590
                &decstep(3,"ebp",$s3,$s2,$s1,$s0);
 
591
                &xor    ($s0,&DWP(16*$i+0,$key));
 
592
                &xor    ($s1,&DWP(16*$i+4,$key));
 
593
                &xor    ($s2,&DWP(16*$i+8,$key));
 
594
                &xor    ($s3,&DWP(16*$i+12,$key));
 
595
            }
 
596
            &add        ($key,32);
 
597
            &mov        (&DWP(12,"esp"),$key);  # advance rd_key
 
598
        &set_label("10rounds");
 
599
            for ($i=1;$i<10;$i++) {
 
600
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
 
601
                &decstep(1,"ebp",$s1,$s0,$s3,$s2);
 
602
                &decstep(2,"ebp",$s2,$s1,$s0,$s3);
 
603
                &decstep(3,"ebp",$s3,$s2,$s1,$s0);
 
604
                &xor    ($s0,&DWP(16*$i+0,$key));
 
605
                &xor    ($s1,&DWP(16*$i+4,$key));
 
606
                &xor    ($s2,&DWP(16*$i+8,$key));
 
607
                &xor    ($s3,&DWP(16*$i+12,$key));
 
608
            }
 
609
        }
 
610
 
 
611
        &declast(0,"ebp",$s0,$s3,$s2,$s1);
 
612
        &declast(1,"ebp",$s1,$s0,$s3,$s2);
 
613
        &declast(2,"ebp",$s2,$s1,$s0,$s3);
 
614
        &declast(3,"ebp",$s3,$s2,$s1,$s0);
 
615
 
 
616
        &add    ($key,$small_footprint?16:160);
 
617
        &xor    ($s0,&DWP(0,$key));
 
618
        &xor    ($s1,&DWP(4,$key));
 
619
        &xor    ($s2,&DWP(8,$key));
 
620
        &xor    ($s3,&DWP(12,$key));
 
621
 
 
622
        &ret    ();
 
623
 
 
624
&set_label("AES_Td",64);        # Yes! I keep it in the code segment!
 
625
        &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
 
626
        &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
 
627
        &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
 
628
        &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
 
629
        &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
 
630
        &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
 
631
        &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
 
632
        &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
 
633
        &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
 
634
        &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
 
635
        &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
 
636
        &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
 
637
        &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
 
638
        &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
 
639
        &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
 
640
        &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
 
641
        &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
 
642
        &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
 
643
        &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
 
644
        &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
 
645
        &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
 
646
        &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
 
647
        &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
 
648
        &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
 
649
        &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
 
650
        &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
 
651
        &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
 
652
        &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
 
653
        &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
 
654
        &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
 
655
        &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
 
656
        &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
 
657
        &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
 
658
        &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
 
659
        &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
 
660
        &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
 
661
        &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
 
662
        &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
 
663
        &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
 
664
        &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
 
665
        &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
 
666
        &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
 
667
        &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
 
668
        &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
 
669
        &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
 
670
        &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
 
671
        &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
 
672
        &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
 
673
        &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
 
674
        &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
 
675
        &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
 
676
        &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
 
677
        &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
 
678
        &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
 
679
        &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
 
680
        &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
 
681
        &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
 
682
        &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
 
683
        &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
 
684
        &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
 
685
        &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
 
686
        &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
 
687
        &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
 
688
        &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
 
689
#Td4:
 
690
        &data_word(0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5);
 
691
        &data_word(0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838);
 
692
        &data_word(0xbfbfbfbf, 0x40404040, 0xa3a3a3a3, 0x9e9e9e9e);
 
693
        &data_word(0x81818181, 0xf3f3f3f3, 0xd7d7d7d7, 0xfbfbfbfb);
 
694
        &data_word(0x7c7c7c7c, 0xe3e3e3e3, 0x39393939, 0x82828282);
 
695
        &data_word(0x9b9b9b9b, 0x2f2f2f2f, 0xffffffff, 0x87878787);
 
696
        &data_word(0x34343434, 0x8e8e8e8e, 0x43434343, 0x44444444);
 
697
        &data_word(0xc4c4c4c4, 0xdededede, 0xe9e9e9e9, 0xcbcbcbcb);
 
698
        &data_word(0x54545454, 0x7b7b7b7b, 0x94949494, 0x32323232);
 
699
        &data_word(0xa6a6a6a6, 0xc2c2c2c2, 0x23232323, 0x3d3d3d3d);
 
700
        &data_word(0xeeeeeeee, 0x4c4c4c4c, 0x95959595, 0x0b0b0b0b);
 
701
        &data_word(0x42424242, 0xfafafafa, 0xc3c3c3c3, 0x4e4e4e4e);
 
702
        &data_word(0x08080808, 0x2e2e2e2e, 0xa1a1a1a1, 0x66666666);
 
703
        &data_word(0x28282828, 0xd9d9d9d9, 0x24242424, 0xb2b2b2b2);
 
704
        &data_word(0x76767676, 0x5b5b5b5b, 0xa2a2a2a2, 0x49494949);
 
705
        &data_word(0x6d6d6d6d, 0x8b8b8b8b, 0xd1d1d1d1, 0x25252525);
 
706
        &data_word(0x72727272, 0xf8f8f8f8, 0xf6f6f6f6, 0x64646464);
 
707
        &data_word(0x86868686, 0x68686868, 0x98989898, 0x16161616);
 
708
        &data_word(0xd4d4d4d4, 0xa4a4a4a4, 0x5c5c5c5c, 0xcccccccc);
 
709
        &data_word(0x5d5d5d5d, 0x65656565, 0xb6b6b6b6, 0x92929292);
 
710
        &data_word(0x6c6c6c6c, 0x70707070, 0x48484848, 0x50505050);
 
711
        &data_word(0xfdfdfdfd, 0xedededed, 0xb9b9b9b9, 0xdadadada);
 
712
        &data_word(0x5e5e5e5e, 0x15151515, 0x46464646, 0x57575757);
 
713
        &data_word(0xa7a7a7a7, 0x8d8d8d8d, 0x9d9d9d9d, 0x84848484);
 
714
        &data_word(0x90909090, 0xd8d8d8d8, 0xabababab, 0x00000000);
 
715
        &data_word(0x8c8c8c8c, 0xbcbcbcbc, 0xd3d3d3d3, 0x0a0a0a0a);
 
716
        &data_word(0xf7f7f7f7, 0xe4e4e4e4, 0x58585858, 0x05050505);
 
717
        &data_word(0xb8b8b8b8, 0xb3b3b3b3, 0x45454545, 0x06060606);
 
718
        &data_word(0xd0d0d0d0, 0x2c2c2c2c, 0x1e1e1e1e, 0x8f8f8f8f);
 
719
        &data_word(0xcacacaca, 0x3f3f3f3f, 0x0f0f0f0f, 0x02020202);
 
720
        &data_word(0xc1c1c1c1, 0xafafafaf, 0xbdbdbdbd, 0x03030303);
 
721
        &data_word(0x01010101, 0x13131313, 0x8a8a8a8a, 0x6b6b6b6b);
 
722
        &data_word(0x3a3a3a3a, 0x91919191, 0x11111111, 0x41414141);
 
723
        &data_word(0x4f4f4f4f, 0x67676767, 0xdcdcdcdc, 0xeaeaeaea);
 
724
        &data_word(0x97979797, 0xf2f2f2f2, 0xcfcfcfcf, 0xcececece);
 
725
        &data_word(0xf0f0f0f0, 0xb4b4b4b4, 0xe6e6e6e6, 0x73737373);
 
726
        &data_word(0x96969696, 0xacacacac, 0x74747474, 0x22222222);
 
727
        &data_word(0xe7e7e7e7, 0xadadadad, 0x35353535, 0x85858585);
 
728
        &data_word(0xe2e2e2e2, 0xf9f9f9f9, 0x37373737, 0xe8e8e8e8);
 
729
        &data_word(0x1c1c1c1c, 0x75757575, 0xdfdfdfdf, 0x6e6e6e6e);
 
730
        &data_word(0x47474747, 0xf1f1f1f1, 0x1a1a1a1a, 0x71717171);
 
731
        &data_word(0x1d1d1d1d, 0x29292929, 0xc5c5c5c5, 0x89898989);
 
732
        &data_word(0x6f6f6f6f, 0xb7b7b7b7, 0x62626262, 0x0e0e0e0e);
 
733
        &data_word(0xaaaaaaaa, 0x18181818, 0xbebebebe, 0x1b1b1b1b);
 
734
        &data_word(0xfcfcfcfc, 0x56565656, 0x3e3e3e3e, 0x4b4b4b4b);
 
735
        &data_word(0xc6c6c6c6, 0xd2d2d2d2, 0x79797979, 0x20202020);
 
736
        &data_word(0x9a9a9a9a, 0xdbdbdbdb, 0xc0c0c0c0, 0xfefefefe);
 
737
        &data_word(0x78787878, 0xcdcdcdcd, 0x5a5a5a5a, 0xf4f4f4f4);
 
738
        &data_word(0x1f1f1f1f, 0xdddddddd, 0xa8a8a8a8, 0x33333333);
 
739
        &data_word(0x88888888, 0x07070707, 0xc7c7c7c7, 0x31313131);
 
740
        &data_word(0xb1b1b1b1, 0x12121212, 0x10101010, 0x59595959);
 
741
        &data_word(0x27272727, 0x80808080, 0xecececec, 0x5f5f5f5f);
 
742
        &data_word(0x60606060, 0x51515151, 0x7f7f7f7f, 0xa9a9a9a9);
 
743
        &data_word(0x19191919, 0xb5b5b5b5, 0x4a4a4a4a, 0x0d0d0d0d);
 
744
        &data_word(0x2d2d2d2d, 0xe5e5e5e5, 0x7a7a7a7a, 0x9f9f9f9f);
 
745
        &data_word(0x93939393, 0xc9c9c9c9, 0x9c9c9c9c, 0xefefefef);
 
746
        &data_word(0xa0a0a0a0, 0xe0e0e0e0, 0x3b3b3b3b, 0x4d4d4d4d);
 
747
        &data_word(0xaeaeaeae, 0x2a2a2a2a, 0xf5f5f5f5, 0xb0b0b0b0);
 
748
        &data_word(0xc8c8c8c8, 0xebebebeb, 0xbbbbbbbb, 0x3c3c3c3c);
 
749
        &data_word(0x83838383, 0x53535353, 0x99999999, 0x61616161);
 
750
        &data_word(0x17171717, 0x2b2b2b2b, 0x04040404, 0x7e7e7e7e);
 
751
        &data_word(0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626);
 
752
        &data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
 
753
        &data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
 
754
&function_end_B("_x86_AES_decrypt");
 
755
 
 
756
# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Public single-block entry point: reads four dwords of state from inp,
# runs them through _x86_AES_decrypt with %ebp addressing AES_Td and
# $key addressing the key schedule, then writes four dwords to out.
&public_label("AES_Td");
&function_begin("AES_decrypt");
{
	my @state=($s0,$s1,$s2,$s3);		# the four state dwords, in order

	&mov	($acc,&wparam(0));		# load inp
	&mov	($key,&wparam(2));		# load key

	# Build a small scratch frame: drop at least 24 bytes, round %esp
	# down to a 64-byte boundary, step up by 4 and remember the
	# caller's %esp at 16(%esp). $s0 is free to carry the old %esp
	# because the state is not loaded until further down.
	&mov	($state[0],"esp");
	&sub	("esp",24);
	&and	("esp",-64);
	&add	("esp",4);
	&mov	(&DWP(16,"esp"),$state[0]);

	# position-independent address of the AES_Td table
	&call	(&label("pic_point"));		# make it PIC!
	&set_label("pic_point");
	&blindpop("ebp");
	&lea	("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

	# load input data
	foreach my $n (0..3) {
		&mov	($state[$n],&DWP(4*$n,$acc));
	}

	&call	("_x86_AES_decrypt");

	&mov	("esp",&DWP(16,"esp"));		# back to the caller's frame

	&mov	($acc,&wparam(1));		# load out
	# write output data
	foreach my $n (0..3) {
		&mov	(&DWP(4*$n,$acc),$state[$n]);
	}
}
&function_end("AES_decrypt");
 
788
 
 
789
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
 
790
#                       size_t length, const AES_KEY *key,
 
791
#                       unsigned char *ivp,const int enc);
 
792
{
# Generates AES_cbc_encrypt(inp, out, length, key, ivp, enc).
#
#  - length==0 returns immediately.
#  - enc!=0 takes the encrypt path (AES_Te), enc==0 jumps to DECRYPT
#    (AES_Td).
#  - A 64-byte aligned frame is carved below %esp and nudged so that it
#    does not share addresses modulo 4096 with the lookup table. The
#    frame receives copies of the arguments and, unless $compromise is
#    set and length is below it, a 244-byte copy of the key schedule.
#    240 bytes of that copy are zeroed again on the way out.
#  - Encrypt: a trailing partial block is zero-padded to 16 bytes and
#    encrypted as one more block, so a full 16 bytes are written to out.
#  - Decrypt, out-of-place: only the remaining bytes of a trailing
#    partial block are copied to out. Decrypt, in-place: the whole block
#    is written and the bytes past the end are then restored from the
#    saved ciphertext.
#  - The last ciphertext block is stored back through ivp.
#
# IMPORTANT: every frame variable below is a fixed %esp-relative
# operand. %esp must therefore not move (no push/pop) between frame
# set-up and the final "mov esp,$_esp", or the operands silently
# address the wrong slots.
#
# stack frame layout
# (left column: offsets as used in this function; right column: the
#  same slots apparently as seen inside the _x86_AES_* subroutines,
#  i.e. shifted by the return address pushed by "call")
# -4(%esp)	0(%esp)		return address
# 0(%esp)	4(%esp)		tmp1
# 4(%esp)	8(%esp)		tmp2
# 8(%esp)	12(%esp)	key
# 12(%esp)	16(%esp)	end of key schedule
my $_esp=&DWP(16,"esp");	#saved %esp
my $_inp=&DWP(20,"esp");	#copy of wparam(0)
my $_out=&DWP(24,"esp");	#copy of wparam(1)
my $_len=&DWP(28,"esp");	#copy of wparam(2)
my $_key=&DWP(32,"esp");	#copy of wparam(3)
my $_ivp=&DWP(36,"esp");	#copy of wparam(4)
my $_tmp=&DWP(40,"esp");	#volatile variable
my $ivec=&DWP(44,"esp");	#ivec[16]
my $aes_key=&DWP(60,"esp");	#copy of aes_key

&public_label("AES_Te");
&public_label("AES_Td");
&function_begin("AES_cbc_encrypt");
	&mov	($s2 eq "ecx"? $s2 : "",&wparam(2));	# load len
	&cmp	($s2,0);
	&je	(&label("enc_out"));

	&call	(&label("pic_point"));		# make it PIC!
	&set_label("pic_point");
	&blindpop("ebp");

	&pushf	();
	&cld	();				# rep movs/stos below must go upwards

	&cmp	(&wparam(5),0);
	&je	(&label("DECRYPT"));

	&lea	("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

	# allocate aligned stack frame...
	&lea	($key,&DWP(-64-244,"esp"));
	&and	($key,-64);

	# ... and make sure it doesn't alias with AES_Te modulo 4096
	&mov	($s0,"ebp");
	&lea	($s1,&DWP(2048,"ebp"));
	&mov	($s3,$key);
	&and	($s0,0xfff);		# s = %ebp&0xfff
	&and	($s1,0xfff);		# e = (%ebp+2048)&0xfff
	&and	($s3,0xfff);		# p = %esp&0xfff

	&cmp	($s3,$s1);		# if (p>=e) %esp -= (p-e);
	&jb	(&label("te_break_out"));
	&sub	($s3,$s1);
	&sub	($key,$s3);
	&jmp	(&label("te_ok"));
	&set_label("te_break_out");	# else %esp -= (p-s)&0xfff + framesz;
	&sub	($s3,$s0);
	&and	($s3,0xfff);
	&add	($s3,64+256);
	&sub	($key,$s3);
	&align	(4);
	&set_label("te_ok");

	&mov	($s0,&wparam(0));	# load inp
	&mov	($s1,&wparam(1));	# load out
	&mov	($s3,&wparam(3));	# load key
	&mov	($acc,&wparam(4));	# load ivp

	&exch	("esp",$key);		# switch to the new frame
	&add	("esp",4);		# reserve for return address!
	&mov	($_esp,$key);		# save %esp

	&mov	($_inp,$s0);		# save copy of inp
	&mov	($_out,$s1);		# save copy of out
	&mov	($_len,$s2);		# save copy of len
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$acc);		# save copy of ivp

	if ($compromise) {
		&cmp	($s2,$compromise);
		&jb	(&label("skip_ecopy"));
	}
	# copy key schedule to stack
	&mov	("ecx",244/4);
	&mov	("esi",$s3);
	&lea	("edi",$aes_key);
	&mov	($_key,"edi");		# from now on use the stack copy
	&align	(4);
	&data_word(0xF689A5F3);	# rep movsd
	&set_label("skip_ecopy") if ($compromise);

	&mov	($acc,$s0);		# $acc = inp
	&mov	($key,16);		# 16 passes of 128 bytes = 2048 bytes
	&align	(4);
	&set_label("prefetch_te");	# touch one dword in every 32 bytes
		&mov	($s0,&DWP(0,"ebp"));
		&mov	($s1,&DWP(32,"ebp"));
		&mov	($s2,&DWP(64,"ebp"));
		&mov	($s3,&DWP(96,"ebp"));
		&lea	("ebp",&DWP(128,"ebp"));
		&dec	($key);
	&jnz	(&label("prefetch_te"));
	&sub	("ebp",2048);		# rewind to the start of the table

	&mov	($s2,$_len);
	&mov	($key,$_ivp);
	&test	($s2,0xFFFFFFF0);
	&jz	(&label("enc_tail"));		# short input...

	&mov	($s0,&DWP(0,$key));		# load iv
	&mov	($s1,&DWP(4,$key));

	&align	(4);
	&set_label("enc_loop");
		# $key points at the previous ciphertext block (or the iv);
		# $s0/$s1 already hold its first two dwords
		&mov	($s2,&DWP(8,$key));
		&mov	($s3,&DWP(12,$key));

		&xor	($s0,&DWP(0,$acc));	# xor input data
		&xor	($s1,&DWP(4,$acc));
		&xor	($s2,&DWP(8,$acc));
		&xor	($s3,&DWP(12,$acc));

		&mov	($key,$_key);		# load key
		&call	("_x86_AES_encrypt");

		&mov	($acc,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&mov	(&DWP(0,$key),$s0);	# save output data
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($s2,$_len);		# load len

		&lea	($acc,&DWP(16,$acc));
		&mov	($_inp,$acc);		# save inp

		&lea	($s3,&DWP(16,$key));
		&mov	($_out,$s3);		# save out

		&sub	($s2,16);
		&test	($s2,0xFFFFFFF0);
		&mov	($_len,$s2);		# save len
	&jnz	(&label("enc_loop"));
	&test	($s2,15);
	&jnz	(&label("enc_tail"));
	&mov	($acc,$_ivp);		# load ivp
	&mov	($s2,&DWP(8,$key));	# restore last dwords
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$acc),$s0);	# save ivec
	&mov	(&DWP(4,$acc),$s1);
	&mov	(&DWP(8,$acc),$s2);
	&mov	(&DWP(12,$acc),$s3);

	&mov	("edi",$_key);
	&mov	("esp",$_esp);
	if ($compromise) {
		&cmp	(&wparam(2),$compromise);
		&jb	(&label("skip_ezero"));
	}
	# zero copy of key schedule
	&mov	("ecx",240/4);
	&xor	("eax","eax");
	&align	(4);
	&data_word(0xF689ABF3);	# rep stosd
	&set_label("skip_ezero") if ($compromise);
	&popf	();
    &set_label("enc_out");
	&function_end_A();
	&pushf	();			# kludge, never executed

    &align	(4);
    &set_label("enc_tail");
	# On entry: $key = ivp (or the previous ciphertext block),
	# $acc = inp, $s2 = number of remaining bytes (1..15).
	#
	# ivp is parked in the $_tmp frame slot rather than pushed: a
	# push would move %esp by 4 and make $_out below address the
	# $_inp slot, so the out/inp comparison would always match and
	# the padding would be written into the input buffer.
	&mov	($_tmp,$key eq "edi" ? $key : "");	# save ivp
	&mov	($key,$_out);			# load out
	&mov	($s1,16);
	&sub	($s1,$s2);			# $s1 = bytes of zero padding
	&cmp	($key,$acc);			# compare with inp
	&je	(&label("enc_in_place"));
	&align	(4);
	&data_word(0xF689A4F3);	# rep movsb	# copy input
	&jmp	(&label("enc_skip_in_place"));
    &set_label("enc_in_place");
	&lea	($key,&DWP(0,$key,$s2));	# skip over the data already there
    &set_label("enc_skip_in_place");
	&mov	($s2,$s1);
	&xor	($s0,$s0);
	&align	(4);
	&data_word(0xF689AAF3);	# rep stosb	# zero tail
	&mov	($key,$_tmp);			# restore ivp

	&mov	($acc,$_out);			# output as input
	&mov	($s0,&DWP(0,$key));
	&mov	($s1,&DWP(4,$key));
	&mov	($_len,16);			# len=16
	&jmp	(&label("enc_loop"));		# one more spin...

#----------------------------- DECRYPT -----------------------------#
&align	(4);
&set_label("DECRYPT");
	&lea	("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));

	# allocate aligned stack frame...
	&lea	($key,&DWP(-64-244,"esp"));
	&and	($key,-64);

	# ... and make sure it doesn't alias with AES_Td modulo 4096
	&mov	($s0,"ebp");
	&lea	($s1,&DWP(3072,"ebp"));
	&mov	($s3,$key);
	&and	($s0,0xfff);		# s = %ebp&0xfff
	&and	($s1,0xfff);		# e = (%ebp+3072)&0xfff
	&and	($s3,0xfff);		# p = %esp&0xfff

	&cmp	($s3,$s1);		# if (p>=e) %esp -= (p-e);
	&jb	(&label("td_break_out"));
	&sub	($s3,$s1);
	&sub	($key,$s3);
	&jmp	(&label("td_ok"));
	&set_label("td_break_out");	# else %esp -= (p-s)&0xfff + framesz;
	&sub	($s3,$s0);
	&and	($s3,0xfff);
	&add	($s3,64+256);
	&sub	($key,$s3);
	&align	(4);
	&set_label("td_ok");

	&mov	($s0,&wparam(0));	# load inp
	&mov	($s1,&wparam(1));	# load out
	&mov	($s3,&wparam(3));	# load key
	&mov	($acc,&wparam(4));	# load ivp

	&exch	("esp",$key);		# switch to the new frame
	&add	("esp",4);		# reserve for return address!
	&mov	($_esp,$key);		# save %esp

	&mov	($_inp,$s0);		# save copy of inp
	&mov	($_out,$s1);		# save copy of out
	&mov	($_len,$s2);		# save copy of len
	&mov	($_key,$s3);		# save copy of key
	&mov	($_ivp,$acc);		# save copy of ivp

	if ($compromise) {
		&cmp	($s2,$compromise);
		&jb	(&label("skip_dcopy"));
	}
	# copy key schedule to stack
	&mov	("ecx",244/4);
	&mov	("esi",$s3);
	&lea	("edi",$aes_key);
	&mov	($_key,"edi");		# from now on use the stack copy
	&align	(4);
	&data_word(0xF689A5F3);	# rep movsd
	&set_label("skip_dcopy") if ($compromise);

	&mov	($acc,$s0);		# $acc = inp
	&mov	($key,24);		# 24 passes of 128 bytes = 3072 bytes
	&align	(4);
	&set_label("prefetch_td");	# touch one dword in every 32 bytes
		&mov	($s0,&DWP(0,"ebp"));
		&mov	($s1,&DWP(32,"ebp"));
		&mov	($s2,&DWP(64,"ebp"));
		&mov	($s3,&DWP(96,"ebp"));
		&lea	("ebp",&DWP(128,"ebp"));
		&dec	($key);
	&jnz	(&label("prefetch_td"));
	&sub	("ebp",3072);		# rewind to the start of the table

	&cmp	($acc,$_out);
	&je	(&label("dec_in_place"));	# in-place processing...

	&mov	($key,$_ivp);		# load ivp
	&mov	($_tmp,$key);		# $_tmp tracks the current iv pointer

	&align	(4);
	&set_label("dec_loop");
		&mov	($s0,&DWP(0,$acc));	# read input
		&mov	($s1,&DWP(4,$acc));
		&mov	($s2,&DWP(8,$acc));
		&mov	($s3,&DWP(12,$acc));

		&mov	($key,$_key);		# load key
		&call	("_x86_AES_decrypt");

		&mov	($key,$_tmp);		# load ivp
		&mov	($acc,$_len);		# load len
		&xor	($s0,&DWP(0,$key));	# xor iv
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&sub	($acc,16);
		&jc	(&label("dec_partial"));
		&mov	($_len,$acc);		# save len
		&mov	($acc,$_inp);		# load inp
		&mov	($key,$_out);		# load out

		&mov	(&DWP(0,$key),$s0);	# write output
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($_tmp,$acc);		# save ivp
		&lea	($acc,&DWP(16,$acc));
		&mov	($_inp,$acc);		# save inp

		&lea	($key,&DWP(16,$key));
		&mov	($_out,$key);		# save out

	# flags are still those of "sub $acc,16" above: mov/lea leave
	# them untouched
	&jnz	(&label("dec_loop"));
	&mov	($key,$_tmp);		# load temp ivp
    &set_label("dec_end");
	&mov	($acc,$_ivp);		# load user ivp
	&mov	($s0,&DWP(0,$key));	# load iv
	&mov	($s1,&DWP(4,$key));
	&mov	($s2,&DWP(8,$key));
	&mov	($s3,&DWP(12,$key));
	&mov	(&DWP(0,$acc),$s0);	# copy back to user
	&mov	(&DWP(4,$acc),$s1);
	&mov	(&DWP(8,$acc),$s2);
	&mov	(&DWP(12,$acc),$s3);
	&jmp	(&label("dec_out"));

    &align	(4);
    &set_label("dec_partial");
	# $acc = len-16 (negative) here, so 16+$acc is the byte count left
	&lea	($key,$ivec);
	&mov	(&DWP(0,$key),$s0);	# dump output to stack
	&mov	(&DWP(4,$key),$s1);
	&mov	(&DWP(8,$key),$s2);
	&mov	(&DWP(12,$key),$s3);
	&lea	($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
	&mov	($acc eq "esi" ? $acc : "",$key);
	&mov	($key eq "edi" ? $key : "",$_out);	# load out
	&data_word(0xF689A4F3);	# rep movsb		# copy output
	&mov	($key,$_inp);				# use inp as temp ivp
	&jmp	(&label("dec_end"));

    &align	(4);
    &set_label("dec_in_place");
	&set_label("dec_in_place_loop");
		&lea	($key,$ivec);
		&mov	($s0,&DWP(0,$acc));	# read input
		&mov	($s1,&DWP(4,$acc));
		&mov	($s2,&DWP(8,$acc));
		&mov	($s3,&DWP(12,$acc));

		# keep the ciphertext: it is about to be overwritten
		# and is needed as the next iv
		&mov	(&DWP(0,$key),$s0);	# copy to temp
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($key,$_key);		# load key
		&call	("_x86_AES_decrypt");

		&mov	($key,$_ivp);		# load ivp
		&mov	($acc,$_out);		# load out
		&xor	($s0,&DWP(0,$key));	# xor iv
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

		&mov	(&DWP(0,$acc),$s0);	# write output
		&mov	(&DWP(4,$acc),$s1);
		&mov	(&DWP(8,$acc),$s2);
		&mov	(&DWP(12,$acc),$s3);

		&lea	($acc,&DWP(16,$acc));
		&mov	($_out,$acc);		# save out

		&lea	($acc,$ivec);
		&mov	($s0,&DWP(0,$acc));	# read temp
		&mov	($s1,&DWP(4,$acc));
		&mov	($s2,&DWP(8,$acc));
		&mov	($s3,&DWP(12,$acc));

		&mov	(&DWP(0,$key),$s0);	# copy iv
		&mov	(&DWP(4,$key),$s1);
		&mov	(&DWP(8,$key),$s2);
		&mov	(&DWP(12,$key),$s3);

		&mov	($acc,$_inp);		# load inp

		&lea	($acc,&DWP(16,$acc));
		&mov	($_inp,$acc);		# save inp

		&mov	($s2,$_len);		# load len
		&sub	($s2,16);
		&jc	(&label("dec_in_place_partial"));
		&mov	($_len,$s2);		# save len
	&jnz	(&label("dec_in_place_loop"));
	&jmp	(&label("dec_out"));

    &align	(4);
    &set_label("dec_in_place_partial");
	# one can argue if this is actually required...
	# $s2 = len-16 (negative); $_out has already been advanced by 16.
	# Copy the -$s2 saved ciphertext bytes that lie past the end of
	# the data back over the bytes just written there.
	&mov	($key eq "edi" ? $key : "",$_out);
	&lea	($acc eq "esi" ? $acc : "",$ivec);
	&lea	($key,&DWP(0,$key,$s2));
	&lea	($acc,&DWP(16,$acc,$s2));
	&neg	($s2 eq "ecx" ? $s2 : "");
	&data_word(0xF689A4F3);	# rep movsb	# restore tail

    &align	(4);
    &set_label("dec_out");
    &mov	("edi",$_key);
    &mov	("esp",$_esp);
    if ($compromise) {
	&cmp	(&wparam(2),$compromise);
	&jb	(&label("skip_dzero"));
    }
    # zero copy of key schedule
    &mov	("ecx",240/4);
    &xor	("eax","eax");
    &align	(4);
    &data_word(0xF689ABF3);	# rep stosd
    &set_label("skip_dzero") if ($compromise);
    &popf	();
&function_end("AES_cbc_encrypt");
}
 
1210
 
 
1211
#------------------------------------------------------------------#
 
1212
 
 
1213
sub enckey()
{
	# Emits the shared core of one key-expansion step.
	#
	# Each of the four bytes of $word indexes the table at $tbl
	# (8-byte stride); one byte lane of the fetched entry is kept and
	# xor-ed into $sum one byte position below the one the index came
	# from (byte 0 wraps round to byte 3). Finally the dword at
	# 2048($tbl,$cnt,4) -- rcon -- is xor-ed in.
	#
	# Result is left in $sum; $tmp, $idx and $word are clobbered.
	# The instruction order is kept exactly as scheduled by the author.
	my ($sum,$tmp,$idx,$word)=("eax","ebx","esi","edx");
	my ($tbl,$cnt)=("ebp","ecx");

	&movz	($idx,&LB($word));		# rk[i]>>0
	&mov	($tmp,&DWP(2,$tbl,$idx,8));
	&movz	($idx,&HB($word));		# rk[i]>>8
	&and	($tmp,0xFF000000);
	&xor	($sum,$tmp);

	&mov	($tmp,&DWP(2,$tbl,$idx,8));
	&shr	($word,16);			# expose the upper two bytes
	&and	($tmp,0x000000FF);
	&movz	($idx,&LB($word));		# rk[i]>>16
	&xor	($sum,$tmp);

	&mov	($tmp,&DWP(0,$tbl,$idx,8));
	&movz	($idx,&HB($word));		# rk[i]>>24
	&and	($tmp,0x0000FF00);
	&xor	($sum,$tmp);

	&mov	($tmp,&DWP(0,$tbl,$idx,8));
	&and	($tmp,0x00FF0000);
	&xor	($sum,$tmp);

	&xor	($sum,&DWP(2048,$tbl,$cnt,4));	# rcon
}
 
1238
 
 
1239
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Emits the encryption key-schedule routine.
#
# Return value (in eax):
#    0  success
#   -1  userKey or key is a NULL pointer
#   -2  bits is not 128, 192 or 256
#
# Register usage inside the generated function:
#   esi = userKey, edi = current position in the key schedule,
#   ebp = address of AES_Te (obtained position-independently),
#   ecx = bits, later reused as the round counter consumed by enckey().
#
# The user key is copied to the start of the schedule as 32-bit words
# and then expanded chunk by chunk.  The round count (10/12/14) is
# stored at byte offset 240 from the start of the schedule; because edi
# has been advanced by the loops, the store uses a path-specific
# displacement (80, 72 or 48) that adds up to 240 in every case.
&public_label("AES_Te");
&function_begin("AES_set_encrypt_key");
        &mov    ("esi",&wparam(0));             # user supplied key
        &mov    ("edi",&wparam(2));             # private key schedule

        # reject NULL pointers
        &test   ("esi",-1);
        &jz     (&label("badpointer"));
        &test   ("edi",-1);
        &jz     (&label("badpointer"));

        # call/pop pair yields the run-time address of pic_point in ebp,
        # from which the address of AES_Te is derived without relocations
        &call   (&label("pic_point"));
        &set_label("pic_point");
        &blindpop("ebp");
        &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

        &mov    ("ecx",&wparam(1));             # number of bits in key
        &cmp    ("ecx",128);
        &je     (&label("10rounds"));
        &cmp    ("ecx",192);
        &je     (&label("12rounds"));
        &cmp    ("ecx",256);
        &je     (&label("14rounds"));
        &mov    ("eax",-2);                     # invalid number of bits
        &jmp    (&label("exit"));

    #---------------------------------------------------------------
    # 128-bit key: 4-word chunks, enckey() runs 10 times (ecx = 0..9)
    #---------------------------------------------------------------
    &set_label("10rounds");
        &mov    ("eax",&DWP(0,"esi"));          # copy first 4 dwords
        &mov    ("ebx",&DWP(4,"esi"));
        &mov    ("ecx",&DWP(8,"esi"));
        &mov    ("edx",&DWP(12,"esi"));
        &mov    (&DWP(0,"edi"),"eax");
        &mov    (&DWP(4,"edi"),"ebx");
        &mov    (&DWP(8,"edi"),"ecx");
        &mov    (&DWP(12,"edi"),"edx");

        &xor    ("ecx","ecx");                  # round counter = 0
        # eax/edx already hold rk[0]/rk[3] from the copy above, so the
        # first pass skips the reloads at the top of the loop
        &jmp    (&label("10shortcut"));

        &align  (4);
        &set_label("10loop");
                &mov    ("eax",&DWP(0,"edi"));          # rk[0]
                &mov    ("edx",&DWP(12,"edi"));         # rk[3]
        &set_label("10shortcut");
                &enckey ();

                &mov    (&DWP(16,"edi"),"eax");         # rk[4]
                &xor    ("eax",&DWP(4,"edi"));
                &mov    (&DWP(20,"edi"),"eax");         # rk[5]
                &xor    ("eax",&DWP(8,"edi"));
                &mov    (&DWP(24,"edi"),"eax");         # rk[6]
                &xor    ("eax",&DWP(12,"edi"));
                &mov    (&DWP(28,"edi"),"eax");         # rk[7]
                &inc    ("ecx");
                &add    ("edi",16);                     # next 4-word chunk
                &cmp    ("ecx",10);
        &jl     (&label("10loop"));

        # edi advanced 10*16 = 160 bytes, so 80(edi) is schedule byte 240
        &mov    (&DWP(80,"edi"),10);            # setup number of rounds
        &xor    ("eax","eax");                  # return 0
        &jmp    (&label("exit"));

    #---------------------------------------------------------------
    # 192-bit key: 6-word chunks, enckey() runs 8 times (ecx = 0..7);
    # the last pass stores only four words and then leaves the loop
    #---------------------------------------------------------------
    &set_label("12rounds");
        &mov    ("eax",&DWP(0,"esi"));          # copy first 6 dwords
        &mov    ("ebx",&DWP(4,"esi"));
        &mov    ("ecx",&DWP(8,"esi"));
        &mov    ("edx",&DWP(12,"esi"));
        &mov    (&DWP(0,"edi"),"eax");
        &mov    (&DWP(4,"edi"),"ebx");
        &mov    (&DWP(8,"edi"),"ecx");
        &mov    (&DWP(12,"edi"),"edx");
        &mov    ("ecx",&DWP(16,"esi"));
        &mov    ("edx",&DWP(20,"esi"));
        &mov    (&DWP(16,"edi"),"ecx");
        &mov    (&DWP(20,"edi"),"edx");

        &xor    ("ecx","ecx");                  # round counter = 0
        # eax/edx already hold rk[0]/rk[5] from the copy above
        &jmp    (&label("12shortcut"));

        &align  (4);
        &set_label("12loop");
                &mov    ("eax",&DWP(0,"edi"));          # rk[0]
                &mov    ("edx",&DWP(20,"edi"));         # rk[5]
        &set_label("12shortcut");
                &enckey ();

                &mov    (&DWP(24,"edi"),"eax");         # rk[6]
                &xor    ("eax",&DWP(4,"edi"));
                &mov    (&DWP(28,"edi"),"eax");         # rk[7]
                &xor    ("eax",&DWP(8,"edi"));
                &mov    (&DWP(32,"edi"),"eax");         # rk[8]
                &xor    ("eax",&DWP(12,"edi"));
                &mov    (&DWP(36,"edi"),"eax");         # rk[9]

                &cmp    ("ecx",7);                      # 8th pass done?
                &je     (&label("12break"));
                &inc    ("ecx");

                &xor    ("eax",&DWP(16,"edi"));
                &mov    (&DWP(40,"edi"),"eax");         # rk[10]
                &xor    ("eax",&DWP(20,"edi"));
                &mov    (&DWP(44,"edi"),"eax");         # rk[11]

                &add    ("edi",24);                     # next 6-word chunk
        &jmp    (&label("12loop"));

        &set_label("12break");
        # edi advanced 7*24 = 168 bytes, so 72(edi) is schedule byte 240
        &mov    (&DWP(72,"edi"),12);            # setup number of rounds
        &xor    ("eax","eax");                  # return 0
        &jmp    (&label("exit"));

    #---------------------------------------------------------------
    # 256-bit key: 8-word chunks, enckey() runs 7 times (ecx = 0..6);
    # the last pass stores only four words and then leaves the loop
    #---------------------------------------------------------------
    &set_label("14rounds");
        &mov    ("eax",&DWP(0,"esi"));          # copy first 8 dwords
        &mov    ("ebx",&DWP(4,"esi"));
        &mov    ("ecx",&DWP(8,"esi"));
        &mov    ("edx",&DWP(12,"esi"));
        &mov    (&DWP(0,"edi"),"eax");
        &mov    (&DWP(4,"edi"),"ebx");
        &mov    (&DWP(8,"edi"),"ecx");
        &mov    (&DWP(12,"edi"),"edx");
        &mov    ("eax",&DWP(16,"esi"));
        &mov    ("ebx",&DWP(20,"esi"));
        &mov    ("ecx",&DWP(24,"esi"));
        &mov    ("edx",&DWP(28,"esi"));
        &mov    (&DWP(16,"edi"),"eax");
        &mov    (&DWP(20,"edi"),"ebx");
        &mov    (&DWP(24,"edi"),"ecx");
        &mov    (&DWP(28,"edi"),"edx");

        &xor    ("ecx","ecx");                  # round counter = 0
        # edx already holds rk[7] from the copy above; eax does not hold
        # rk[0] any more, hence the shortcut label sits before its reload
        &jmp    (&label("14shortcut"));

        &align  (4);
        &set_label("14loop");
                &mov    ("edx",&DWP(28,"edi"));         # rk[7]
        &set_label("14shortcut");
                &mov    ("eax",&DWP(0,"edi"));          # rk[0]

                &enckey ();

                &mov    (&DWP(32,"edi"),"eax");         # rk[8]
                &xor    ("eax",&DWP(4,"edi"));
                &mov    (&DWP(36,"edi"),"eax");         # rk[9]
                &xor    ("eax",&DWP(8,"edi"));
                &mov    (&DWP(40,"edi"),"eax");         # rk[10]
                &xor    ("eax",&DWP(12,"edi"));
                &mov    (&DWP(44,"edi"),"eax");         # rk[11]

                &cmp    ("ecx",6);                      # 7th pass done?
                &je     (&label("14break"));
                &inc    ("ecx");

                # Second half of the chunk: table lookup on the bytes of
                # rk[11] like in enckey(), but each looked-up byte stays in
                # its own byte position (no rotation) and no rcon is added.
                &mov    ("edx","eax");
                &mov    ("eax",&DWP(16,"edi"));         # rk[4]
                &movz   ("esi",&LB("edx"));             # rk[11]>>0
                &mov    ("ebx",&DWP(2,"ebp","esi",8));
                &movz   ("esi",&HB("edx"));             # rk[11]>>8
                &and    ("ebx",0x000000FF);
                &xor    ("eax","ebx");

                &mov    ("ebx",&DWP(0,"ebp","esi",8));
                &shr    ("edx",16);
                &and    ("ebx",0x0000FF00);
                &movz   ("esi",&LB("edx"));             # rk[11]>>16
                &xor    ("eax","ebx");

                &mov    ("ebx",&DWP(0,"ebp","esi",8));
                &movz   ("esi",&HB("edx"));             # rk[11]>>24
                &and    ("ebx",0x00FF0000);
                &xor    ("eax","ebx");

                &mov    ("ebx",&DWP(2,"ebp","esi",8));
                &and    ("ebx",0xFF000000);
                &xor    ("eax","ebx");

                &mov    (&DWP(48,"edi"),"eax");         # rk[12]
                &xor    ("eax",&DWP(20,"edi"));
                &mov    (&DWP(52,"edi"),"eax");         # rk[13]
                &xor    ("eax",&DWP(24,"edi"));
                &mov    (&DWP(56,"edi"),"eax");         # rk[14]
                &xor    ("eax",&DWP(28,"edi"));
                &mov    (&DWP(60,"edi"),"eax");         # rk[15]

                &add    ("edi",32);                     # next 8-word chunk
        &jmp    (&label("14loop"));

        &set_label("14break");
        # edi advanced 6*32 = 192 bytes, so 48(edi) is schedule byte 240
        &mov    (&DWP(48,"edi"),14);            # setup number of rounds
        &xor    ("eax","eax");                  # return 0
        &jmp    (&label("exit"));

    &set_label("badpointer");
        &mov    ("eax",-1);                     # NULL userKey or key
    &set_label("exit");
&function_end("AES_set_encrypt_key");
 
1435
 
 
1436
# deckey($i, $ptr, $te, $td)
#
# Emits code that rewrites one 32-bit round-key word in place.
#
#   $i   - byte displacement of the word relative to $ptr
#   $ptr - register holding the address of the current chunk
#   $te  - register holding the base of the Te table
#   $td  - register holding the base of the Td table
#
# For every byte b of the word the generated code first fetches the
# single byte at 2+8*b from the Te table, then uses that byte as an
# 8-byte-stride index into the Td table.  The four Td reads use byte
# displacements 0, 3, 2 and 1 for source bytes 0, 1, 2 and 3, and the
# four dwords are XORed together and stored back to $i($ptr).
# NOTE(review): the displacements 3/2/1 presumably pick rotated views
# of a doubled 8-byte Td entry; the table layout is not visible here --
# confirm against the AES_Td definition.
#
# Clobbers eax, ebx and edx.
#
# Declared without a prototype: the sub takes four arguments, so an
# empty "()" prototype would be wrong for any call not using the "&"
# form.
sub deckey
{ my ($i,$ptr,$te,$td) = @_;

        &mov    ("eax",&DWP($i,$ptr));          # word to transform
        &mov    ("edx","eax");
        &movz   ("ebx",&HB("eax"));             # byte 1
        &shr    ("edx",16);                     # edx = bytes 3:2
        &and    ("eax",0xFF);                   # byte 0
        &movz   ("eax",&BP(2,$te,"eax",8));     # Te byte for byte 0
        &movz   ("ebx",&BP(2,$te,"ebx",8));     # Te byte for byte 1
        &mov    ("eax",&DWP(0,$td,"eax",8));
        &xor    ("eax",&DWP(3,$td,"ebx",8));
        &movz   ("ebx",&HB("edx"));             # byte 3
        &and    ("edx",0xFF);                   # byte 2
        &movz   ("edx",&BP(2,$te,"edx",8));     # Te byte for byte 2
        &movz   ("ebx",&BP(2,$te,"ebx",8));     # Te byte for byte 3
        &xor    ("eax",&DWP(2,$td,"edx",8));
        &xor    ("eax",&DWP(1,$td,"ebx",8));
        &mov    (&DWP($i,$ptr),"eax");          # store result in place
}
 
1456
 
 
1457
# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
#                        AES_KEY *key)
#
# Emits the decryption key-schedule routine.  The generated function
#   1. calls AES_set_encrypt_key with the same three arguments and
#      returns its result unchanged if that is non-zero;
#   2. reverses the order of the 16-byte chunks of the schedule;
#   3. applies deckey() to every word of chunks 1 .. rounds-1 (the
#      first and the last chunk are left as they are);
#   4. returns 0.
#
# The round count is read from byte offset 240 of the key schedule.
#
# NOTE(review): function_begin_B presumably opens the function without
# saving registers; the callee-saved registers are pushed by hand only
# on the "proceed" path, which is why the early-error path can use a
# plain ret -- confirm against the perlasm helper definitions.
&public_label("AES_Td");
&public_label("AES_Te");
&function_begin_B("AES_set_decrypt_key");
        # forward the three arguments to AES_set_encrypt_key
        &mov    ("eax",&wparam(0));
        &mov    ("ecx",&wparam(1));
        &mov    ("edx",&wparam(2));
        &sub    ("esp",12);
        &mov    (&DWP(0,"esp"),"eax");
        &mov    (&DWP(4,"esp"),"ecx");
        &mov    (&DWP(8,"esp"),"edx");
        &call   ("AES_set_encrypt_key");
        &add    ("esp",12);
        &cmp    ("eax",0);
        &je     (&label("proceed"));
        &ret    ();                             # propagate error code in eax

    &set_label("proceed");
        &push   ("ebp");
        &push   ("ebx");
        &push   ("esi");
        &push   ("edi");

        &mov    ("esi",&wparam(2));             # first chunk
        &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
        &lea    ("ecx",&DWP(0,"","ecx",4));     # ecx = rounds*4
        &lea    ("edi",&DWP(0,"esi","ecx",4));  # pointer to last chunk

        # swap chunks pairwise from both ends until esi and edi meet
        &align  (4);
        &set_label("invert");                   # invert order of chunks
                &mov    ("eax",&DWP(0,"esi"));
                &mov    ("ebx",&DWP(4,"esi"));
                &mov    ("ecx",&DWP(0,"edi"));
                &mov    ("edx",&DWP(4,"edi"));
                &mov    (&DWP(0,"edi"),"eax");
                &mov    (&DWP(4,"edi"),"ebx");
                &mov    (&DWP(0,"esi"),"ecx");
                &mov    (&DWP(4,"esi"),"edx");
                &mov    ("eax",&DWP(8,"esi"));
                &mov    ("ebx",&DWP(12,"esi"));
                &mov    ("ecx",&DWP(8,"edi"));
                &mov    ("edx",&DWP(12,"edi"));
                &mov    (&DWP(8,"edi"),"eax");
                &mov    (&DWP(12,"edi"),"ebx");
                &mov    (&DWP(8,"esi"),"ecx");
                &mov    (&DWP(12,"esi"),"edx");
                &add    ("esi",16);
                &sub    ("edi",16);
                &cmp    ("esi","edi");
        &jne    (&label("invert"));

        # call/pop pair yields the run-time address of pic_point in ebp;
        # edi and ebp then receive the addresses of AES_Td and AES_Te
        &call   (&label("pic_point"));
        &set_label("pic_point");
        &blindpop("ebp");
        &lea    ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
        &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));

        &mov    ("esi",&wparam(2));
        &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
        &dec    ("ecx");                        # rounds-1 chunks to permute
        &align  (4);
        &set_label("permute");                  # permute the key schedule
                &add    ("esi",16);             # advance first: chunk 0 is skipped
                &deckey (0,"esi","ebp","edi");
                &deckey (4,"esi","ebp","edi");
                &deckey (8,"esi","ebp","edi");
                &deckey (12,"esi","ebp","edi");
                &dec    ("ecx");
        &jnz    (&label("permute"));

        &xor    ("eax","eax");                  # return success
&function_end("AES_set_decrypt_key");
 
1530
 
 
1531
# Last statement visible in this part of the script.
# NOTE(review): asm_finish is a perlasm helper not defined in the visible
# code; presumably it finalizes and emits the accumulated assembler
# output -- confirm against the helper module.
&asm_finish();