~andersk/ubuntu/oneiric/openssl/spurious-reboot

« back to all changes in this revision

Viewing changes to .pc/perl-path.diff/crypto/bn/asm/mo-586.pl

  • Committer: Bazaar Package Importer
  • Author(s): Colin Watson
  • Date: 2011-05-01 23:51:53 UTC
  • mfrom: (11.1.20 sid)
  • Revision ID: james.westby@ubuntu.com-20110501235153-bjcxitndquaezb68
Tags: 1.0.0d-2ubuntu1
* Resynchronise with Debian (LP: #675566).  Remaining changes:
  - debian/libssl1.0.0.postinst:
    + Display a system restart required notification bubble on libssl1.0.0
      upgrade.
    + Use a different priority for libssl1.0.0/restart-services depending
      on whether a desktop, or server dist-upgrade is being performed.
  - debian/{libssl1.0.0-udeb.dirs, control, rules}: Create
    libssl1.0.0-udeb, for the benefit of wget-udeb (no wget-udeb package
    in Debian).
  - debian/{libcrypto1.0.0-udeb.dirs, libssl1.0.0.dirs, libssl1.0.0.files,
    rules}: Move runtime libraries to /lib, for the benefit of
    wpasupplicant.
  - debian/patches/aesni.patch: Backport Intel AES-NI support, now from
    http://rt.openssl.org/Ticket/Display.html?id=2065 rather than the
    0.9.8 variant.
  - debian/patches/Bsymbolic-functions.patch: Link using
    -Bsymbolic-functions.
  - debian/patches/perlpath-quilt.patch: Don't change perl #! paths under
    .pc.
  - debian/rules:
    + Don't run 'make test' when cross-building.
    + Use host compiler when cross-building.  Patch from Neil Williams.
    + Don't build for processors no longer supported: i486, i586 (on
      i386), v8 (on sparc).
    + Fix Makefile to properly clean up libs/ dirs in clean target.
    + Replace duplicate files in the doc directory with symlinks.
* Update architectures affected by Bsymbolic-functions.patch.
* Drop debian/patches/no-sslv2.patch; Debian now adds the 'no-ssl2'
  configure option, which compiles out SSLv2 support entirely, so this is
  no longer needed.
* Drop openssl-doc in favour of the libssl-doc package introduced by
  Debian.  Add Conflicts/Replaces until the next LTS release.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
#!/usr/bin/env perl
2
 
 
3
 
# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
4
 
# from OpenSSL 0.9.9-dev 
5
 
 
6
 
# Emit the given string as assembler data bytes followed by a terminating
# NUL.  The byte values are handed to &data_byte (not defined in this file;
# presumably supplied by x86asm.pl) in rows of at most 16 values each.
sub ::asciz
{
    my ($text) = @_;

    # Byte values of the string (unpack "C*") plus the trailing zero.
    my @bytes = (unpack("C*", $text), 0);

    # Peel off full rows of 16 while more than 16 bytes are still pending,
    # so that the final row below always carries between 1 and 16 bytes.
    while (scalar(@bytes) > 16) {
        &data_byte(splice(@bytes, 0, 16));
    }

    # Last row; never empty in practice because of the NUL appended above.
    &data_byte(@bytes) if (@bytes);
}
15
 
 
16
 
# ====================================================================
17
 
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
18
 
# project. The module is, however, dual licensed under OpenSSL and
19
 
# CRYPTOGAMS licenses depending on where you obtain it. For further
20
 
# details see http://www.openssl.org/~appro/cryptogams/.
21
 
# ====================================================================
22
 
 
23
 
# October 2005
24
 
#
25
 
# This is "teaser" code, as it can be improved in several ways...
26
 
# First of all non-SSE2 path should be implemented (yes, for now it
27
 
# performs Montgomery multiplication/convolution only on SSE2-capable
28
 
# CPUs such as P4, others fall back to the original code). Then inner loop
29
 
# can be unrolled and modulo-scheduled to improve ILP and possibly
30
 
# moved to 128-bit XMM register bank (though it would require input
31
 
# rearrangement and/or increase bus bandwidth utilization). Dedicated
32
 
# squaring procedure should give further performance improvement...
33
 
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
34
 
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
35
 
 
36
 
# December 2006
37
 
#
38
 
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
39
 
# Integer-only code [being equipped with dedicated squaring procedure]
40
 
# gives ~40% on rsa512 sign benchmark...
41
 
 
42
 
# Extend the module search path with the relative directories "perlasm" and
# "../../perlasm" so that the require below can locate x86asm.pl.
push(@INC,"perlasm","../../perlasm");
43
 
require "x86asm.pl";
44
 
 
# Hand the first command-line argument and this script's name ($0) to
# asm_init (not defined in this file; presumably supplied by x86asm.pl).
45
 
&asm_init($ARGV[0],$0);
46
 
 
# $sse2 becomes 1 when any command-line argument contains the text
# -DOPENSSL_IA32_SSE2 (unanchored match); it gates generation of the SSE2
# code path further down.
47
 
$sse2=0;
48
 
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
49
 
 
# OPENSSL_ia32cap_P is declared as an external label only when SSE2 code is
# going to be generated.
50
 
&external_label("OPENSSL_ia32cap_P") if ($sse2);
51
 
 
52
 
# Open the generated function bn_mul_mont; the matching function_end call is
# at the bottom of the script.
&function_begin("bn_mul_mont");
53
 
 
# Symbolic names for the x86 registers used by the code below.  $ap/$tp both
# name esi and $rp/$bp both name edi, so the two names of a pair can never be
# live at the same time.
54
 
$i="edx";
55
 
$j="ecx";
56
 
$ap="esi";      $tp="esi";              # overlapping variables!!!
57
 
$rp="edi";      $bp="edi";              # overlapping variables!!!
58
 
$np="ebp";
59
 
$num="ebx";
60
 
 
# Operands for eight consecutive 4-byte slots at esp+0 .. esp+28.
# $_n0q names the same offset as $_n0 but through QWP instead of DWP --
# presumably an 8-byte view, which would then also span the $_sp slot;
# TODO confirm against x86asm.pl.
61
 
$_num=&DWP(4*0,"esp");                  # stack top layout
62
 
$_rp=&DWP(4*1,"esp");
63
 
$_ap=&DWP(4*2,"esp");
64
 
$_bp=&DWP(4*3,"esp");
65
 
$_np=&DWP(4*4,"esp");
66
 
$_n0=&DWP(4*5,"esp");   $_n0q=&QWP(4*5,"esp");
67
 
$_sp=&DWP(4*6,"esp");
68
 
$_bpend=&DWP(4*7,"esp");
# 8 slots * 4 bytes = 32; the temporary vector tp starts at esp+$frame.
69
 
$frame=32;                              # size of above frame rounded up to 16n
70
 
 
71
 
        # Argument check: eax is cleared first, so the early exit taken for
        # num < 4 reaches just_leave with eax == 0.
        &xor    ("eax","eax");
72
 
        &mov    ("edi",&wparam(5));     # int num
73
 
        &cmp    ("edi",4);
74
 
        &jl     (&label("just_leave"));
75
 
 
76
 
        &lea    ("esi",&wparam(0));     # put aside pointer to argument block
        # NOTE(review): this is lea, not mov, so edx receives the address of
        # the ap argument slot rather than the ap pointer stored in it; the
        # cache-window arithmetic below then works on that address.  Confirm
        # whether mov was intended.
77
 
        &lea    ("edx",&wparam(1));     # load ap
78
 
        &mov    ("ebp","esp");          # saved stack pointer!
79
 
        &add    ("edi",2);              # extra two words on top of tp
        # edi is negated so that the scaled index in the lea below moves esp
        # downwards, then negated back: afterwards edi holds num+2 again.
80
 
        &neg    ("edi");
81
 
        &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
82
 
        &neg    ("edi");
83
 
 
84
 
        # minimize cache contention by arranging 2K window between stack
85
 
        # pointer and ap argument [np is also position sensitive vector,
86
 
        # but it's assumed to be near ap, as it's allocated at ~same
87
 
        # time].
88
 
        &mov    ("eax","esp");
89
 
        &sub    ("eax","edx");
90
 
        &and    ("eax",2047);
91
 
        &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
92
 
 
        # edx becomes 2048 when bit 11 of esp and edx agree and 0 otherwise,
        # so after the subtraction the two values differ in that bit.
93
 
        &xor    ("edx","esp");
94
 
        &and    ("edx",2048);
95
 
        &xor    ("edx",2048);
96
 
        &sub    ("esp","edx");          # this splits them apart modulo 4096
97
 
 
98
 
        &and    ("esp",-64);            # align to cache line
99
 
 
        # esi is the base of the argument block, so it is the last register
        # to be overwritten (first with the n0 pointer, then with n0[0]).
100
 
        ################################# load argument block...
101
 
        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
102
 
        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
103
 
        &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
104
 
        &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
105
 
        &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
106
 
        #&mov   ("edi",&DWP(5*4,"esi"));# int num
107
 
 
108
 
        &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
109
 
        &mov    ($_rp,"eax");           # ... save a copy of argument block
110
 
        &mov    ($_ap,"ebx");
111
 
        &mov    ($_bp,"ecx");
112
 
        &mov    ($_np,"edx");
113
 
        &mov    ($_n0,"esi");
        # edi still holds num+2 (see the add/neg/neg sequence above), hence
        # the displacement of -3 leaves num-1 in $num.
114
 
        &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
115
 
        #&mov   ($_num,$num);           # redundant as $num is not reused
        # ebp holds the esp value captured before the frame was carved out.
116
 
        &mov    ($_sp,"ebp");           # saved stack pointer!
117
 
 
118
 
if($sse2) {
119
 
$acc0="mm0";    # mmx register bank layout
120
 
$acc1="mm1";
121
 
$car0="mm2";
122
 
$car1="mm3";
123
 
$mul0="mm4";
124
 
$mul1="mm5";
125
 
$temp="mm6";
126
 
$mask="mm7";
127
 
 
128
 
        &picmeup("eax","OPENSSL_ia32cap_P");
129
 
        &bt     (&DWP(0,"eax"),26);
130
 
        &jnc    (&label("non_sse2"));
131
 
 
132
 
        &mov    ("eax",-1);
133
 
        &movd   ($mask,"eax");          # mask 32 lower bits
134
 
 
135
 
        &mov    ($ap,$_ap);             # load input pointers
136
 
        &mov    ($bp,$_bp);
137
 
        &mov    ($np,$_np);
138
 
 
139
 
        &xor    ($i,$i);                # i=0
140
 
        &xor    ($j,$j);                # j=0
141
 
 
142
 
        &movd   ($mul0,&DWP(0,$bp));            # bp[0]
143
 
        &movd   ($mul1,&DWP(0,$ap));            # ap[0]
144
 
        &movd   ($car1,&DWP(0,$np));            # np[0]
145
 
 
146
 
        &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
147
 
        &movq   ($car0,$mul1);
148
 
        &movq   ($acc0,$mul1);                  # I wish movd worked for
149
 
        &pand   ($acc0,$mask);                  # inter-register transfers
150
 
 
151
 
        &pmuludq($mul1,$_n0q);                  # *=n0
152
 
 
153
 
        &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
154
 
        &paddq  ($car1,$acc0);
155
 
 
156
 
        &movd   ($acc1,&DWP(4,$np));            # np[1]
157
 
        &movd   ($acc0,&DWP(4,$ap));            # ap[1]
158
 
 
159
 
        &psrlq  ($car0,32);
160
 
        &psrlq  ($car1,32);
161
 
 
162
 
        &inc    ($j);                           # j++
163
 
&set_label("1st",16);
164
 
        &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
165
 
        &pmuludq($acc1,$mul1);                  # np[j]*m1
166
 
        &paddq  ($car0,$acc0);                  # +=c0
167
 
        &paddq  ($car1,$acc1);                  # +=c1
168
 
 
169
 
        &movq   ($acc0,$car0);
170
 
        &pand   ($acc0,$mask);
171
 
        &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
172
 
        &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
173
 
        &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
174
 
        &psrlq  ($car0,32);
175
 
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
176
 
        &psrlq  ($car1,32);
177
 
 
178
 
        &lea    ($j,&DWP(1,$j));
179
 
        &cmp    ($j,$num);
180
 
        &jl     (&label("1st"));
181
 
 
182
 
        &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
183
 
        &pmuludq($acc1,$mul1);                  # np[num-1]*m1
184
 
        &paddq  ($car0,$acc0);                  # +=c0
185
 
        &paddq  ($car1,$acc1);                  # +=c1
186
 
 
187
 
        &movq   ($acc0,$car0);
188
 
        &pand   ($acc0,$mask);
189
 
        &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
190
 
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
191
 
 
192
 
        &psrlq  ($car0,32);
193
 
        &psrlq  ($car1,32);
194
 
 
195
 
        &paddq  ($car1,$car0);
196
 
        &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
197
 
 
198
 
        &inc    ($i);                           # i++
199
 
&set_label("outer");
200
 
        &xor    ($j,$j);                        # j=0
201
 
 
202
 
        &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
203
 
        &movd   ($mul1,&DWP(0,$ap));            # ap[0]
204
 
        &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
205
 
        &movd   ($car1,&DWP(0,$np));            # np[0]
206
 
        &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
207
 
 
208
 
        &paddq  ($mul1,$temp);                  # +=tp[0]
209
 
        &movq   ($acc0,$mul1);
210
 
        &movq   ($car0,$mul1);
211
 
        &pand   ($acc0,$mask);
212
 
 
213
 
        &pmuludq($mul1,$_n0q);                  # *=n0
214
 
 
215
 
        &pmuludq($car1,$mul1);
216
 
        &paddq  ($car1,$acc0);
217
 
 
218
 
        &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
219
 
        &movd   ($acc1,&DWP(4,$np));            # np[1]
220
 
        &movd   ($acc0,&DWP(4,$ap));            # ap[1]
221
 
 
222
 
        &psrlq  ($car0,32);
223
 
        &psrlq  ($car1,32);
224
 
        &paddq  ($car0,$temp);                  # +=tp[1]
225
 
 
226
 
        &inc    ($j);                           # j++
227
 
        &dec    ($num);
228
 
&set_label("inner");
229
 
        &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
230
 
        &pmuludq($acc1,$mul1);                  # np[j]*m1
231
 
        &paddq  ($car0,$acc0);                  # +=c0
232
 
        &paddq  ($car1,$acc1);                  # +=c1
233
 
 
234
 
        &movq   ($acc0,$car0);
235
 
        &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
236
 
        &pand   ($acc0,$mask);
237
 
        &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
238
 
        &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
239
 
        &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
240
 
        &psrlq  ($car0,32);
241
 
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
242
 
        &psrlq  ($car1,32);
243
 
        &paddq  ($car0,$temp);                  # +=tp[j+1]
244
 
 
245
 
        &dec    ($num);
246
 
        &lea    ($j,&DWP(1,$j));                # j++
247
 
        &jnz    (&label("inner"));
248
 
 
249
 
        &mov    ($num,$j);
250
 
        &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
251
 
        &pmuludq($acc1,$mul1);                  # np[num-1]*m1
252
 
        &paddq  ($car0,$acc0);                  # +=c0
253
 
        &paddq  ($car1,$acc1);                  # +=c1
254
 
 
255
 
        &movq   ($acc0,$car0);
256
 
        &pand   ($acc0,$mask);
257
 
        &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
258
 
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
259
 
        &psrlq  ($car0,32);
260
 
        &psrlq  ($car1,32);
261
 
 
262
 
        &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
263
 
        &paddq  ($car1,$car0);
264
 
        &paddq  ($car1,$temp);
265
 
        &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
266
 
 
267
 
        &lea    ($i,&DWP(1,$i));                # i++
268
 
        &cmp    ($i,$num);
269
 
        &jle    (&label("outer"));
270
 
 
271
 
        &emms   ();                             # done with mmx bank
272
 
        &jmp    (&label("common_tail"));
273
 
 
274
 
&set_label("non_sse2",16);
275
 
}
276
 
 
277
 
if (0) {
278
 
        &mov    ("esp",$_sp);
279
 
        &xor    ("eax","eax");  # signal "not fast enough [yet]"
280
 
        &jmp    (&label("just_leave"));
281
 
        # While the below code provides competitive performance for
282
 
        # all key lengths on modern Intel cores, it's still more
283
 
        # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
284
 
        # means compared to the original integer-only assembler.
285
 
        # 512-bit RSA sign is better by ~40%, but that's about all
286
 
        # one can say about all CPUs...
287
 
} else {
288
 
$inp="esi";     # integer path uses these registers differently
289
 
$word="edi";
290
 
$carry="ebp";
291
 
 
292
 
        &mov    ($inp,$_ap);
293
 
        &lea    ($carry,&DWP(1,$num));
294
 
        &mov    ($word,$_bp);
295
 
        &xor    ($j,$j);                                # j=0
296
 
        &mov    ("edx",$inp);
297
 
        &and    ($carry,1);                             # see if num is even
298
 
        &sub    ("edx",$word);                          # see if ap==bp
299
 
        &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
300
 
        &or     ($carry,"edx");
301
 
        &mov    ($word,&DWP(0,$word));                  # bp[0]
302
 
        &jz     (&label("bn_sqr_mont"));
303
 
        &mov    ($_bpend,"eax");
304
 
        &mov    ("eax",&DWP(0,$inp));
305
 
        &xor    ("edx","edx");
306
 
 
307
 
&set_label("mull",16);
308
 
        &mov    ($carry,"edx");
309
 
        &mul    ($word);                                # ap[j]*bp[0]
310
 
        &add    ($carry,"eax");
311
 
        &lea    ($j,&DWP(1,$j));
312
 
        &adc    ("edx",0);
313
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
314
 
        &cmp    ($j,$num);
315
 
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
316
 
        &jl     (&label("mull"));
317
 
 
318
 
        &mov    ($carry,"edx");
319
 
        &mul    ($word);                                # ap[num-1]*bp[0]
320
 
         &mov   ($word,$_n0);
321
 
        &add    ("eax",$carry);
322
 
         &mov   ($inp,$_np);
323
 
        &adc    ("edx",0);
324
 
         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
325
 
 
326
 
        &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
327
 
        &xor    ($j,$j);
328
 
        &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
329
 
        &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
330
 
 
331
 
        &mov    ("eax",&DWP(0,$inp));                   # np[0]
332
 
        &mul    ($word);                                # np[0]*m
333
 
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
334
 
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
335
 
        &adc    ("edx",0);
336
 
        &inc    ($j);
337
 
 
338
 
        &jmp    (&label("2ndmadd"));
339
 
 
340
 
&set_label("1stmadd",16);
341
 
        &mov    ($carry,"edx");
342
 
        &mul    ($word);                                # ap[j]*bp[i]
343
 
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
344
 
        &lea    ($j,&DWP(1,$j));
345
 
        &adc    ("edx",0);
346
 
        &add    ($carry,"eax");
347
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
348
 
        &adc    ("edx",0);
349
 
        &cmp    ($j,$num);
350
 
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
351
 
        &jl     (&label("1stmadd"));
352
 
 
353
 
        &mov    ($carry,"edx");
354
 
        &mul    ($word);                                # ap[num-1]*bp[i]
355
 
        &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
356
 
         &mov   ($word,$_n0);
357
 
        &adc    ("edx",0);
358
 
         &mov   ($inp,$_np);
359
 
        &add    ($carry,"eax");
360
 
        &adc    ("edx",0);
361
 
         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
362
 
 
363
 
        &xor    ($j,$j);
364
 
        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
365
 
        &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
366
 
        &adc    ($j,0);
367
 
         &mov   ("eax",&DWP(0,$inp));                   # np[0]
368
 
        &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
369
 
        &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
370
 
 
371
 
        &mul    ($word);                                # np[0]*m
372
 
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
373
 
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
374
 
        &adc    ("edx",0);
375
 
        &mov    ($j,1);
376
 
 
377
 
&set_label("2ndmadd",16);
378
 
        &mov    ($carry,"edx");
379
 
        &mul    ($word);                                # np[j]*m
380
 
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
381
 
        &lea    ($j,&DWP(1,$j));
382
 
        &adc    ("edx",0);
383
 
        &add    ($carry,"eax");
384
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
385
 
        &adc    ("edx",0);
386
 
        &cmp    ($j,$num);
387
 
        &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
388
 
        &jl     (&label("2ndmadd"));
389
 
 
390
 
        &mov    ($carry,"edx");
391
 
        &mul    ($word);                                # np[j]*m
392
 
        &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
393
 
        &adc    ("edx",0);
394
 
        &add    ($carry,"eax");
395
 
        &adc    ("edx",0);
396
 
        &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
397
 
 
398
 
        &xor    ("eax","eax");
399
 
         &mov   ($j,$_bp);                              # &bp[i]
400
 
        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
401
 
        &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
402
 
         &lea   ($j,&DWP(4,$j));
403
 
        &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
404
 
         &cmp   ($j,$_bpend);
405
 
        &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
406
 
        &je     (&label("common_tail"));
407
 
 
408
 
        &mov    ($word,&DWP(0,$j));                     # bp[i+1]
409
 
        &mov    ($inp,$_ap);
410
 
        &mov    ($_bp,$j);                              # &bp[++i]
411
 
        &xor    ($j,$j);
412
 
        &xor    ("edx","edx");
413
 
        &mov    ("eax",&DWP(0,$inp));
414
 
        &jmp    (&label("1stmadd"));
415
 
 
416
 
&set_label("bn_sqr_mont",16);
417
 
$sbit=$num;
418
 
        &mov    ($_num,$num);
419
 
        &mov    ($_bp,$j);                              # i=0
420
 
 
421
 
        &mov    ("eax",$word);                          # ap[0]
422
 
        &mul    ($word);                                # ap[0]*ap[0]
423
 
        &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
424
 
        &mov    ($sbit,"edx");
425
 
        &shr    ("edx",1);
426
 
        &and    ($sbit,1);
427
 
        &inc    ($j);
428
 
&set_label("sqr",16);
429
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
430
 
        &mov    ($carry,"edx");
431
 
        &mul    ($word);                                # ap[j]*ap[0]
432
 
        &add    ("eax",$carry);
433
 
        &lea    ($j,&DWP(1,$j));
434
 
        &adc    ("edx",0);
435
 
        &lea    ($carry,&DWP(0,$sbit,"eax",2));
436
 
        &shr    ("eax",31);
437
 
        &cmp    ($j,$_num);
438
 
        &mov    ($sbit,"eax");
439
 
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
440
 
        &jl     (&label("sqr"));
441
 
 
442
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
443
 
        &mov    ($carry,"edx");
444
 
        &mul    ($word);                                # ap[num-1]*ap[0]
445
 
        &add    ("eax",$carry);
446
 
         &mov   ($word,$_n0);
447
 
        &adc    ("edx",0);
448
 
         &mov   ($inp,$_np);
449
 
        &lea    ($carry,&DWP(0,$sbit,"eax",2));
450
 
         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
451
 
        &shr    ("eax",31);
452
 
        &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
453
 
 
454
 
        &lea    ($carry,&DWP(0,"eax","edx",2));
455
 
         &mov   ("eax",&DWP(0,$inp));                   # np[0]
456
 
        &shr    ("edx",31);
457
 
        &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
458
 
        &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
459
 
 
460
 
        &mul    ($word);                                # np[0]*m
461
 
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
462
 
        &mov    ($num,$j);
463
 
        &adc    ("edx",0);
464
 
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
465
 
        &mov    ($j,1);
466
 
 
467
 
&set_label("3rdmadd",16);
468
 
        &mov    ($carry,"edx");
469
 
        &mul    ($word);                                # np[j]*m
470
 
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
471
 
        &adc    ("edx",0);
472
 
        &add    ($carry,"eax");
473
 
        &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
474
 
        &adc    ("edx",0);
475
 
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
476
 
 
477
 
        &mov    ($carry,"edx");
478
 
        &mul    ($word);                                # np[j+1]*m
479
 
        &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
480
 
        &lea    ($j,&DWP(2,$j));
481
 
        &adc    ("edx",0);
482
 
        &add    ($carry,"eax");
483
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
484
 
        &adc    ("edx",0);
485
 
        &cmp    ($j,$num);
486
 
        &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
487
 
        &jl     (&label("3rdmadd"));
488
 
 
489
 
        &mov    ($carry,"edx");
490
 
        &mul    ($word);                                # np[j]*m
491
 
        &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
492
 
        &adc    ("edx",0);
493
 
        &add    ($carry,"eax");
494
 
        &adc    ("edx",0);
495
 
        &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
496
 
 
497
 
        &mov    ($j,$_bp);                              # i
498
 
        &xor    ("eax","eax");
499
 
        &mov    ($inp,$_ap);
500
 
        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
501
 
        &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
502
 
        &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
503
 
        &cmp    ($j,$num);
504
 
        &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
505
 
        &je     (&label("common_tail"));
506
 
 
507
 
        &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
508
 
        &lea    ($j,&DWP(1,$j));
509
 
        &mov    ("eax",$word);
510
 
        &mov    ($_bp,$j);                              # ++i
511
 
        &mul    ($word);                                # ap[i]*ap[i]
512
 
        &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
513
 
        &adc    ("edx",0);
514
 
        &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
515
 
        &xor    ($carry,$carry);
516
 
        &cmp    ($j,$num);
517
 
        &lea    ($j,&DWP(1,$j));
518
 
        &je     (&label("sqrlast"));
519
 
 
520
 
        &mov    ($sbit,"edx");                          # zaps $num
521
 
        &shr    ("edx",1);
522
 
        &and    ($sbit,1);
523
 
&set_label("sqradd",16);
524
 
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
525
 
        &mov    ($carry,"edx");
526
 
        &mul    ($word);                                # ap[j]*ap[i]
527
 
        &add    ("eax",$carry);
528
 
        &lea    ($carry,&DWP(0,"eax","eax"));
529
 
        &adc    ("edx",0);
530
 
        &shr    ("eax",31);
531
 
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
532
 
        &lea    ($j,&DWP(1,$j));
533
 
        &adc    ("eax",0);
534
 
        &add    ($carry,$sbit);
535
 
        &adc    ("eax",0);
536
 
        &cmp    ($j,$_num);
537
 
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
538
 
        &mov    ($sbit,"eax");
539
 
        &jle    (&label("sqradd"));
540
 
 
541
 
        &mov    ($carry,"edx");
542
 
        &lea    ("edx",&DWP(0,$sbit,"edx",2));
543
 
        &shr    ($carry,31);
544
 
&set_label("sqrlast");
545
 
        &mov    ($word,$_n0);
546
 
        &mov    ($inp,$_np);
547
 
        &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
548
 
 
549
 
        &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
550
 
        &mov    ("eax",&DWP(0,$inp));                   # np[0]
551
 
        &adc    ($carry,0);
552
 
        &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
553
 
        &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
554
 
 
555
 
        &mul    ($word);                                # np[0]*m
556
 
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
557
 
        &lea    ($num,&DWP(-1,$j));
558
 
        &adc    ("edx",0);
559
 
        &mov    ($j,1);
560
 
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
561
 
 
562
 
        &jmp    (&label("3rdmadd"));
563
 
}
564
 
 
565
 
# Shared epilogue of all multiplication paths: subtract the modulus from the
# temporary vector tp into rp, pick tp or rp as the final result depending on
# the borrow, copy it to rp while overwriting tp, restore esp and return 1.
# $num is expected to hold num-1 on entry (see the "j=num-1" comment below).
&set_label("common_tail",16);
566
 
        &mov    ($np,$_np);                     # load modulus pointer
567
 
        &mov    ($rp,$_rp);                     # load result pointer
568
 
        &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
569
 
 
570
 
        &mov    ("eax",&DWP(0,$tp));            # tp[0]
571
 
        &mov    ($j,$num);                      # j=num-1
572
 
        &xor    ($i,$i);                        # i=0 and clear CF!
573
 
 
# Multi-word subtraction with borrow carried in CF from one sbb to the next.
# The jge at the bottom tests the flags left by dec; the mov and lea between
# them do not modify flags.  The loop ends once $j has gone below zero.
574
 
&set_label("sub",16);
575
 
        &sbb    ("eax",&DWP(0,$np,$i,4));
576
 
        &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
577
 
        &dec    ($j);                           # doesn't affect CF!
578
 
        &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
579
 
        &lea    ($i,&DWP(1,$i));                # i++
580
 
        &jge    (&label("sub"));
581
 
 
        # eax holds the word loaded last in the loop (tp[num]); subtracting
        # the final borrow is expected to leave either 0 or all-ones, which
        # is then used as a mask: $tp = ($tp & eax) | ($rp & ~eax).
582
 
        &sbb    ("eax",0);                      # handle upmost overflow bit
583
 
        &and    ($tp,"eax");
584
 
        &not    ("eax");
585
 
        &mov    ($np,$rp);
586
 
        &and    ($np,"eax");
587
 
        &or     ($tp,$np);                      # tp=carry?tp:rp
588
 
 
# Copy the selected vector to rp from index $num down to 0.  Each frame word
# is overwritten with the leftover value of $j, which the "sub" loop above
# counted down past zero.
589
 
&set_label("copy",16);                          # copy or in-place refresh
590
 
        &mov    ("eax",&DWP(0,$tp,$num,4));
591
 
        &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
592
 
        &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
593
 
        &dec    ($num);
594
 
        &jge    (&label("copy"));
595
 
 
596
 
        &mov    ("esp",$_sp);           # pull saved stack pointer
        # Return value 1; the early exit for num < 4 jumps to just_leave with
        # eax still 0.
597
 
        &mov    ("eax",1);
598
 
&set_label("just_leave");
599
 
&function_end("bn_mul_mont");
600
 
 
601
 
# Emit the identification banner as NUL-terminated data bytes using the
# ::asciz helper defined at the top of this file (\@ keeps the literal '@'
# from being interpolated as an array).
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
602
 
 
# asm_finish is not defined in this file; presumably x86asm.pl uses it to
# finalize and write out the generated assembly -- TODO confirm.
603
 
&asm_finish();