~ubuntu-branches/ubuntu/lucid/openssl/lucid-proposed

« back to all changes in this revision

Viewing changes to crypto/bn/asm/mo-586.pl

  • Committer: Bazaar Package Importer
  • Author(s): Kurt Roeckx
  • Date: 2009-06-13 18:15:46 UTC
  • mto: (11.1.5 squeeze)
  • mto: This revision was merged to the branch mainline in revision 34.
  • Revision ID: james.westby@ubuntu.com-20090613181546-vbfntai3b009dl1u
Tags: upstream-0.9.8k
Import upstream version 0.9.8k

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/env perl
 
2
 
 
3
# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
 
4
# from OpenSSL 0.9.9-dev 
 
5
 
 
6
sub ::asciz
{   # Emit a string as a NUL-terminated sequence of byte values.
    # The single argument is unpacked to unsigned char values ("C*") and a
    # trailing 0 is appended; the values are then handed to &data_byte
    # in rows of at most 16.
    my @bytes = (unpack("C*",shift), 0);

    # Full rows of 16 while more than 16 values remain...
    &data_byte(splice(@bytes,0,16)) while (@bytes > 16);

    # ...then the final row (1..16 values).
    &data_byte(@bytes) if (@bytes);
}
 
15
 
 
16
# ====================================================================
 
17
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 
18
# project. The module is, however, dual licensed under OpenSSL and
 
19
# CRYPTOGAMS licenses depending on where you obtain it. For further
 
20
# details see http://www.openssl.org/~appro/cryptogams/.
 
21
# ====================================================================
 
22
 
 
23
# October 2005
 
24
#
 
25
# This is a "teaser" code, as it can be improved in several ways...
 
26
# First of all non-SSE2 path should be implemented (yes, for now it
 
27
# performs Montgomery multiplication/convolution only on SSE2-capable
 
28
# CPUs such as P4, others fall down to original code). Then inner loop
 
29
# can be unrolled and modulo-scheduled to improve ILP and possibly
 
30
# moved to 128-bit XMM register bank (though it would require input
 
31
# rearrangement and/or increase bus bandwidth utilization). Dedicated
 
32
# squaring procedure should give further performance improvement...
 
33
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
 
34
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
 
35
 
 
36
# December 2006
 
37
#
 
38
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
 
39
# Integer-only code [being equipped with dedicated squaring procedure]
 
40
# gives ~40% on rsa512 sign benchmark...
 
41
 
 
42
# Make x86asm.pl reachable from the search path, then load it.
push @INC, "perlasm", "../../perlasm";
require "x86asm.pl";

&asm_init($ARGV[0], $0);

# $sse2 is 1 when any command-line argument contains -DOPENSSL_IA32_SSE2,
# 0 otherwise.
$sse2 = (grep { /-DOPENSSL_IA32_SSE2/ } @ARGV) ? 1 : 0;

# The capability vector is declared only when the SSE2 path is generated.
if ($sse2) {
    &external_label("OPENSSL_ia32cap_P");
}

&function_begin("bn_mul_mont");
 
53
 
 
54
# Symbolic names for the general-purpose registers used below.
($i, $j)    = ("edx", "ecx");
$ap = $tp   = "esi";                    # overlapping variables!!!
$rp = $bp   = "edi";                    # overlapping variables!!!
($np, $num) = ("ebp", "ebx");

# Stack top layout: eight dword slots addressed off esp.  $_n0q is a
# qword operand over the same offset as $_n0.
($_num, $_rp, $_ap, $_bp, $_np, $_n0, $_n0q, $_sp, $_bpend) = (
        &DWP(4*0,"esp"),
        &DWP(4*1,"esp"),
        &DWP(4*2,"esp"),
        &DWP(4*3,"esp"),
        &DWP(4*4,"esp"),
        &DWP(4*5,"esp"),
        &QWP(4*5,"esp"),
        &DWP(4*6,"esp"),
        &DWP(4*7,"esp"),
);
$frame = 32;                            # size of above frame rounded up to 16n
 
70
 
 
71
        &xor    ("eax","eax");
 
72
        &mov    ("edi",&wparam(5));     # int num
 
73
        &cmp    ("edi",4);
 
74
        &jl     (&label("just_leave"));
 
75
 
 
76
        &lea    ("esi",&wparam(0));     # put aside pointer to argument block
 
77
        &lea    ("edx",&wparam(1));     # load ap
 
78
        &mov    ("ebp","esp");          # saved stack pointer!
 
79
        &add    ("edi",2);              # extra two words on top of tp
 
80
        &neg    ("edi");
 
81
        &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
 
82
        &neg    ("edi");
 
83
 
 
84
        # minimize cache contention by arraning 2K window between stack
 
85
        # pointer and ap argument [np is also position sensitive vector,
 
86
        # but it's assumed to be near ap, as it's allocated at ~same
 
87
        # time].
 
88
        &mov    ("eax","esp");
 
89
        &sub    ("eax","edx");
 
90
        &and    ("eax",2047);
 
91
        &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
 
92
 
 
93
        &xor    ("edx","esp");
 
94
        &and    ("edx",2048);
 
95
        &xor    ("edx",2048);
 
96
        &sub    ("esp","edx");          # this splits them apart modulo 4096
 
97
 
 
98
        &and    ("esp",-64);            # align to cache line
 
99
 
 
100
        ################################# load argument block...
 
101
        &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 
102
        &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
 
103
        &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
 
104
        &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
 
105
        &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
 
106
        #&mov   ("edi",&DWP(5*4,"esi"));# int num
 
107
 
 
108
        &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
 
109
        &mov    ($_rp,"eax");           # ... save a copy of argument block
 
110
        &mov    ($_ap,"ebx");
 
111
        &mov    ($_bp,"ecx");
 
112
        &mov    ($_np,"edx");
 
113
        &mov    ($_n0,"esi");
 
114
        &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
 
115
        #&mov   ($_num,$num);           # redundant as $num is not reused
 
116
        &mov    ($_sp,"ebp");           # saved stack pointer!
 
117
 
 
118
if($sse2) {
 
119
$acc0="mm0";    # mmx register bank layout
 
120
$acc1="mm1";
 
121
$car0="mm2";
 
122
$car1="mm3";
 
123
$mul0="mm4";
 
124
$mul1="mm5";
 
125
$temp="mm6";
 
126
$mask="mm7";
 
127
 
 
128
        &picmeup("eax","OPENSSL_ia32cap_P");
 
129
        &bt     (&DWP(0,"eax"),26);
 
130
        &jnc    (&label("non_sse2"));
 
131
 
 
132
        &mov    ("eax",-1);
 
133
        &movd   ($mask,"eax");          # mask 32 lower bits
 
134
 
 
135
        &mov    ($ap,$_ap);             # load input pointers
 
136
        &mov    ($bp,$_bp);
 
137
        &mov    ($np,$_np);
 
138
 
 
139
        &xor    ($i,$i);                # i=0
 
140
        &xor    ($j,$j);                # j=0
 
141
 
 
142
        &movd   ($mul0,&DWP(0,$bp));            # bp[0]
 
143
        &movd   ($mul1,&DWP(0,$ap));            # ap[0]
 
144
        &movd   ($car1,&DWP(0,$np));            # np[0]
 
145
 
 
146
        &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
 
147
        &movq   ($car0,$mul1);
 
148
        &movq   ($acc0,$mul1);                  # I wish movd worked for
 
149
        &pand   ($acc0,$mask);                  # inter-register transfers
 
150
 
 
151
        &pmuludq($mul1,$_n0q);                  # *=n0
 
152
 
 
153
        &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
 
154
        &paddq  ($car1,$acc0);
 
155
 
 
156
        &movd   ($acc1,&DWP(4,$np));            # np[1]
 
157
        &movd   ($acc0,&DWP(4,$ap));            # ap[1]
 
158
 
 
159
        &psrlq  ($car0,32);
 
160
        &psrlq  ($car1,32);
 
161
 
 
162
        &inc    ($j);                           # j++
 
163
&set_label("1st",16);
 
164
        &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
 
165
        &pmuludq($acc1,$mul1);                  # np[j]*m1
 
166
        &paddq  ($car0,$acc0);                  # +=c0
 
167
        &paddq  ($car1,$acc1);                  # +=c1
 
168
 
 
169
        &movq   ($acc0,$car0);
 
170
        &pand   ($acc0,$mask);
 
171
        &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
 
172
        &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
 
173
        &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
 
174
        &psrlq  ($car0,32);
 
175
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
 
176
        &psrlq  ($car1,32);
 
177
 
 
178
        &lea    ($j,&DWP(1,$j));
 
179
        &cmp    ($j,$num);
 
180
        &jl     (&label("1st"));
 
181
 
 
182
        &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
 
183
        &pmuludq($acc1,$mul1);                  # np[num-1]*m1
 
184
        &paddq  ($car0,$acc0);                  # +=c0
 
185
        &paddq  ($car1,$acc1);                  # +=c1
 
186
 
 
187
        &movq   ($acc0,$car0);
 
188
        &pand   ($acc0,$mask);
 
189
        &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
 
190
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
 
191
 
 
192
        &psrlq  ($car0,32);
 
193
        &psrlq  ($car1,32);
 
194
 
 
195
        &paddq  ($car1,$car0);
 
196
        &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
 
197
 
 
198
        &inc    ($i);                           # i++
 
199
&set_label("outer");
 
200
        &xor    ($j,$j);                        # j=0
 
201
 
 
202
        &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
 
203
        &movd   ($mul1,&DWP(0,$ap));            # ap[0]
 
204
        &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
 
205
        &movd   ($car1,&DWP(0,$np));            # np[0]
 
206
        &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
 
207
 
 
208
        &paddq  ($mul1,$temp);                  # +=tp[0]
 
209
        &movq   ($acc0,$mul1);
 
210
        &movq   ($car0,$mul1);
 
211
        &pand   ($acc0,$mask);
 
212
 
 
213
        &pmuludq($mul1,$_n0q);                  # *=n0
 
214
 
 
215
        &pmuludq($car1,$mul1);
 
216
        &paddq  ($car1,$acc0);
 
217
 
 
218
        &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
 
219
        &movd   ($acc1,&DWP(4,$np));            # np[1]
 
220
        &movd   ($acc0,&DWP(4,$ap));            # ap[1]
 
221
 
 
222
        &psrlq  ($car0,32);
 
223
        &psrlq  ($car1,32);
 
224
        &paddq  ($car0,$temp);                  # +=tp[1]
 
225
 
 
226
        &inc    ($j);                           # j++
 
227
        &dec    ($num);
 
228
&set_label("inner");
 
229
        &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
 
230
        &pmuludq($acc1,$mul1);                  # np[j]*m1
 
231
        &paddq  ($car0,$acc0);                  # +=c0
 
232
        &paddq  ($car1,$acc1);                  # +=c1
 
233
 
 
234
        &movq   ($acc0,$car0);
 
235
        &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
 
236
        &pand   ($acc0,$mask);
 
237
        &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
 
238
        &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
 
239
        &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
 
240
        &psrlq  ($car0,32);
 
241
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
 
242
        &psrlq  ($car1,32);
 
243
        &paddq  ($car0,$temp);                  # +=tp[j+1]
 
244
 
 
245
        &dec    ($num);
 
246
        &lea    ($j,&DWP(1,$j));                # j++
 
247
        &jnz    (&label("inner"));
 
248
 
 
249
        &mov    ($num,$j);
 
250
        &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
 
251
        &pmuludq($acc1,$mul1);                  # np[num-1]*m1
 
252
        &paddq  ($car0,$acc0);                  # +=c0
 
253
        &paddq  ($car1,$acc1);                  # +=c1
 
254
 
 
255
        &movq   ($acc0,$car0);
 
256
        &pand   ($acc0,$mask);
 
257
        &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
 
258
        &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
 
259
        &psrlq  ($car0,32);
 
260
        &psrlq  ($car1,32);
 
261
 
 
262
        &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
 
263
        &paddq  ($car1,$car0);
 
264
        &paddq  ($car1,$temp);
 
265
        &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
 
266
 
 
267
        &lea    ($i,&DWP(1,$i));                # i++
 
268
        &cmp    ($i,$num);
 
269
        &jle    (&label("outer"));
 
270
 
 
271
        &emms   ();                             # done with mmx bank
 
272
        &jmp    (&label("common_tail"));
 
273
 
 
274
&set_label("non_sse2",16);
 
275
}
 
276
 
 
277
if (0) {
 
278
        &mov    ("esp",$_sp);
 
279
        &xor    ("eax","eax");  # signal "not fast enough [yet]"
 
280
        &jmp    (&label("just_leave"));
 
281
        # While the below code provides competitive performance for
 
282
        # all key lengthes on modern Intel cores, it's still more
 
283
        # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 
284
        # means compared to the original integer-only assembler.
 
285
        # 512-bit RSA sign is better by ~40%, but that's about all
 
286
        # one can say about all CPUs...
 
287
} else {
 
288
$inp="esi";     # integer path uses these registers differently
 
289
$word="edi";
 
290
$carry="ebp";
 
291
 
 
292
        &mov    ($inp,$_ap);
 
293
        &lea    ($carry,&DWP(1,$num));
 
294
        &mov    ($word,$_bp);
 
295
        &xor    ($j,$j);                                # j=0
 
296
        &mov    ("edx",$inp);
 
297
        &and    ($carry,1);                             # see if num is even
 
298
        &sub    ("edx",$word);                          # see if ap==bp
 
299
        &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
 
300
        &or     ($carry,"edx");
 
301
        &mov    ($word,&DWP(0,$word));                  # bp[0]
 
302
        &jz     (&label("bn_sqr_mont"));
 
303
        &mov    ($_bpend,"eax");
 
304
        &mov    ("eax",&DWP(0,$inp));
 
305
        &xor    ("edx","edx");
 
306
 
 
307
&set_label("mull",16);
 
308
        &mov    ($carry,"edx");
 
309
        &mul    ($word);                                # ap[j]*bp[0]
 
310
        &add    ($carry,"eax");
 
311
        &lea    ($j,&DWP(1,$j));
 
312
        &adc    ("edx",0);
 
313
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
 
314
        &cmp    ($j,$num);
 
315
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
 
316
        &jl     (&label("mull"));
 
317
 
 
318
        &mov    ($carry,"edx");
 
319
        &mul    ($word);                                # ap[num-1]*bp[0]
 
320
         &mov   ($word,$_n0);
 
321
        &add    ("eax",$carry);
 
322
         &mov   ($inp,$_np);
 
323
        &adc    ("edx",0);
 
324
         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
 
325
 
 
326
        &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
 
327
        &xor    ($j,$j);
 
328
        &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
 
329
        &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
 
330
 
 
331
        &mov    ("eax",&DWP(0,$inp));                   # np[0]
 
332
        &mul    ($word);                                # np[0]*m
 
333
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
 
334
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
 
335
        &adc    ("edx",0);
 
336
        &inc    ($j);
 
337
 
 
338
        &jmp    (&label("2ndmadd"));
 
339
 
 
340
&set_label("1stmadd",16);
 
341
        &mov    ($carry,"edx");
 
342
        &mul    ($word);                                # ap[j]*bp[i]
 
343
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
 
344
        &lea    ($j,&DWP(1,$j));
 
345
        &adc    ("edx",0);
 
346
        &add    ($carry,"eax");
 
347
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
 
348
        &adc    ("edx",0);
 
349
        &cmp    ($j,$num);
 
350
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
 
351
        &jl     (&label("1stmadd"));
 
352
 
 
353
        &mov    ($carry,"edx");
 
354
        &mul    ($word);                                # ap[num-1]*bp[i]
 
355
        &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
 
356
         &mov   ($word,$_n0);
 
357
        &adc    ("edx",0);
 
358
         &mov   ($inp,$_np);
 
359
        &add    ($carry,"eax");
 
360
        &adc    ("edx",0);
 
361
         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
 
362
 
 
363
        &xor    ($j,$j);
 
364
        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
 
365
        &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
 
366
        &adc    ($j,0);
 
367
         &mov   ("eax",&DWP(0,$inp));                   # np[0]
 
368
        &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
 
369
        &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
 
370
 
 
371
        &mul    ($word);                                # np[0]*m
 
372
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
 
373
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
 
374
        &adc    ("edx",0);
 
375
        &mov    ($j,1);
 
376
 
 
377
&set_label("2ndmadd",16);
 
378
        &mov    ($carry,"edx");
 
379
        &mul    ($word);                                # np[j]*m
 
380
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
 
381
        &lea    ($j,&DWP(1,$j));
 
382
        &adc    ("edx",0);
 
383
        &add    ($carry,"eax");
 
384
        &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
 
385
        &adc    ("edx",0);
 
386
        &cmp    ($j,$num);
 
387
        &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
 
388
        &jl     (&label("2ndmadd"));
 
389
 
 
390
        &mov    ($carry,"edx");
 
391
        &mul    ($word);                                # np[j]*m
 
392
        &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
 
393
        &adc    ("edx",0);
 
394
        &add    ($carry,"eax");
 
395
        &adc    ("edx",0);
 
396
        &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
 
397
 
 
398
        &xor    ("eax","eax");
 
399
         &mov   ($j,$_bp);                              # &bp[i]
 
400
        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
 
401
        &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
 
402
         &lea   ($j,&DWP(4,$j));
 
403
        &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
 
404
         &cmp   ($j,$_bpend);
 
405
        &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
 
406
        &je     (&label("common_tail"));
 
407
 
 
408
        &mov    ($word,&DWP(0,$j));                     # bp[i+1]
 
409
        &mov    ($inp,$_ap);
 
410
        &mov    ($_bp,$j);                              # &bp[++i]
 
411
        &xor    ($j,$j);
 
412
        &xor    ("edx","edx");
 
413
        &mov    ("eax",&DWP(0,$inp));
 
414
        &jmp    (&label("1stmadd"));
 
415
 
 
416
&set_label("bn_sqr_mont",16);
 
417
$sbit=$num;
 
418
        &mov    ($_num,$num);
 
419
        &mov    ($_bp,$j);                              # i=0
 
420
 
 
421
        &mov    ("eax",$word);                          # ap[0]
 
422
        &mul    ($word);                                # ap[0]*ap[0]
 
423
        &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
 
424
        &mov    ($sbit,"edx");
 
425
        &shr    ("edx",1);
 
426
        &and    ($sbit,1);
 
427
        &inc    ($j);
 
428
&set_label("sqr",16);
 
429
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
 
430
        &mov    ($carry,"edx");
 
431
        &mul    ($word);                                # ap[j]*ap[0]
 
432
        &add    ("eax",$carry);
 
433
        &lea    ($j,&DWP(1,$j));
 
434
        &adc    ("edx",0);
 
435
        &lea    ($carry,&DWP(0,$sbit,"eax",2));
 
436
        &shr    ("eax",31);
 
437
        &cmp    ($j,$_num);
 
438
        &mov    ($sbit,"eax");
 
439
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
 
440
        &jl     (&label("sqr"));
 
441
 
 
442
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
 
443
        &mov    ($carry,"edx");
 
444
        &mul    ($word);                                # ap[num-1]*ap[0]
 
445
        &add    ("eax",$carry);
 
446
         &mov   ($word,$_n0);
 
447
        &adc    ("edx",0);
 
448
         &mov   ($inp,$_np);
 
449
        &lea    ($carry,&DWP(0,$sbit,"eax",2));
 
450
         &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
 
451
        &shr    ("eax",31);
 
452
        &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
 
453
 
 
454
        &lea    ($carry,&DWP(0,"eax","edx",2));
 
455
         &mov   ("eax",&DWP(0,$inp));                   # np[0]
 
456
        &shr    ("edx",31);
 
457
        &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
 
458
        &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
 
459
 
 
460
        &mul    ($word);                                # np[0]*m
 
461
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
 
462
        &mov    ($num,$j);
 
463
        &adc    ("edx",0);
 
464
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
 
465
        &mov    ($j,1);
 
466
 
 
467
&set_label("3rdmadd",16);
 
468
        &mov    ($carry,"edx");
 
469
        &mul    ($word);                                # np[j]*m
 
470
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
 
471
        &adc    ("edx",0);
 
472
        &add    ($carry,"eax");
 
473
        &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
 
474
        &adc    ("edx",0);
 
475
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
 
476
 
 
477
        &mov    ($carry,"edx");
 
478
        &mul    ($word);                                # np[j+1]*m
 
479
        &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
 
480
        &lea    ($j,&DWP(2,$j));
 
481
        &adc    ("edx",0);
 
482
        &add    ($carry,"eax");
 
483
        &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
 
484
        &adc    ("edx",0);
 
485
        &cmp    ($j,$num);
 
486
        &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
 
487
        &jl     (&label("3rdmadd"));
 
488
 
 
489
        &mov    ($carry,"edx");
 
490
        &mul    ($word);                                # np[j]*m
 
491
        &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
 
492
        &adc    ("edx",0);
 
493
        &add    ($carry,"eax");
 
494
        &adc    ("edx",0);
 
495
        &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
 
496
 
 
497
        &mov    ($j,$_bp);                              # i
 
498
        &xor    ("eax","eax");
 
499
        &mov    ($inp,$_ap);
 
500
        &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
 
501
        &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
 
502
        &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
 
503
        &cmp    ($j,$num);
 
504
        &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
 
505
        &je     (&label("common_tail"));
 
506
 
 
507
        &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
 
508
        &lea    ($j,&DWP(1,$j));
 
509
        &mov    ("eax",$word);
 
510
        &mov    ($_bp,$j);                              # ++i
 
511
        &mul    ($word);                                # ap[i]*ap[i]
 
512
        &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
 
513
        &adc    ("edx",0);
 
514
        &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
 
515
        &xor    ($carry,$carry);
 
516
        &cmp    ($j,$num);
 
517
        &lea    ($j,&DWP(1,$j));
 
518
        &je     (&label("sqrlast"));
 
519
 
 
520
        &mov    ($sbit,"edx");                          # zaps $num
 
521
        &shr    ("edx",1);
 
522
        &and    ($sbit,1);
 
523
&set_label("sqradd",16);
 
524
        &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
 
525
        &mov    ($carry,"edx");
 
526
        &mul    ($word);                                # ap[j]*ap[i]
 
527
        &add    ("eax",$carry);
 
528
        &lea    ($carry,&DWP(0,"eax","eax"));
 
529
        &adc    ("edx",0);
 
530
        &shr    ("eax",31);
 
531
        &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
 
532
        &lea    ($j,&DWP(1,$j));
 
533
        &adc    ("eax",0);
 
534
        &add    ($carry,$sbit);
 
535
        &adc    ("eax",0);
 
536
        &cmp    ($j,$_num);
 
537
        &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
 
538
        &mov    ($sbit,"eax");
 
539
        &jle    (&label("sqradd"));
 
540
 
 
541
        &mov    ($carry,"edx");
 
542
        &lea    ("edx",&DWP(0,$sbit,"edx",2));
 
543
        &shr    ($carry,31);
 
544
&set_label("sqrlast");
 
545
        &mov    ($word,$_n0);
 
546
        &mov    ($inp,$_np);
 
547
        &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
 
548
 
 
549
        &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
 
550
        &mov    ("eax",&DWP(0,$inp));                   # np[0]
 
551
        &adc    ($carry,0);
 
552
        &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
 
553
        &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
 
554
 
 
555
        &mul    ($word);                                # np[0]*m
 
556
        &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
 
557
        &lea    ($num,&DWP(-1,$j));
 
558
        &adc    ("edx",0);
 
559
        &mov    ($j,1);
 
560
        &mov    ("eax",&DWP(4,$inp));                   # np[1]
 
561
 
 
562
        &jmp    (&label("3rdmadd"));
 
563
}
 
564
 
 
565
&set_label("common_tail",16);
 
566
        &mov    ($np,$_np);                     # load modulus pointer
 
567
        &mov    ($rp,$_rp);                     # load result pointer
 
568
        &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
 
569
 
 
570
        &mov    ("eax",&DWP(0,$tp));            # tp[0]
 
571
        &mov    ($j,$num);                      # j=num-1
 
572
        &xor    ($i,$i);                        # i=0 and clear CF!
 
573
 
 
574
&set_label("sub",16);
 
575
        &sbb    ("eax",&DWP(0,$np,$i,4));
 
576
        &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
 
577
        &dec    ($j);                           # doesn't affect CF!
 
578
        &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
 
579
        &lea    ($i,&DWP(1,$i));                # i++
 
580
        &jge    (&label("sub"));
 
581
 
 
582
        &sbb    ("eax",0);                      # handle upmost overflow bit
 
583
        &and    ($tp,"eax");
 
584
        &not    ("eax");
 
585
        &mov    ($np,$rp);
 
586
        &and    ($np,"eax");
 
587
        &or     ($tp,$np);                      # tp=carry?tp:rp
 
588
 
 
589
&set_label("copy",16);                          # copy or in-place refresh
 
590
        &mov    ("eax",&DWP(0,$tp,$num,4));
 
591
        &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
 
592
        &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
 
593
        &dec    ($num);
 
594
        &jge    (&label("copy"));
 
595
 
 
596
        &mov    ("esp",$_sp);           # pull saved stack pointer
 
597
        &mov    ("eax",1);
 
598
&set_label("just_leave");
 
599
&function_end("bn_mul_mont");
 
600
 
 
601
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
602
 
 
603
&asm_finish();