2
# ***** BEGIN LICENSE BLOCK *****
3
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
5
# The contents of this file are subject to the Mozilla Public License Version
6
# 1.1 (the "License"); you may not use this file except in compliance with
7
# the License. You may obtain a copy of the License at
8
# http://www.mozilla.org/MPL/
10
# Software distributed under the License is distributed on an "AS IS" basis,
11
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
# for the specific language governing rights and limitations under the
15
# The Original Code is the Netscape security libraries.
17
# The Initial Developer of the Original Code is
18
# Netscape Communications Corporation.
19
# Portions created by the Initial Developer are Copyright (C) 2000
20
# the Initial Developer. All Rights Reserved.
24
# Alternatively, the contents of this file may be used under the terms of
25
# either the GNU General Public License Version 2 or later (the "GPL"), or
26
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
# in which case the provisions of the GPL or the LGPL are applicable instead
28
# of those above. If you wish to allow use of your version of this file only
29
# under the terms of either the GPL or the LGPL, and not to allow others to
30
# use your version of this file under the terms of the MPL, indicate your
31
# decision by deleting the provisions above and replace them with the notice
32
# and other provisions required by the GPL or the LGPL. If you do not delete
33
# the provisions above, a recipient may use your version of this file under
34
# the terms of any one of the MPL, the GPL or the LGPL.
36
# ***** END LICENSE BLOCK *****
38
# $Id: mpi_x86_os2.s,v 1.1 2009/06/04 23:53:42 julien.pierre.boogz%sun.com Exp $
44
# -1 means to call _s_mpi_is_sse to determine if we support sse
46
# 0 means to use x86 instructions
47
# 1 means to use sse2 instructions
53
# sigh, handle the difference between -fPIC and not PIC
54
# default to pic, since this file seems to be exclusively
55
# linux right now (solaris uses mpi_i86pc.s and windows uses
60
# movl \var@GOTOFF(%ebx),\reg
63
# movl \reg,\var@GOTOFF(%ebx)
77
# ebp - 36: caller's esi
78
# ebp - 32: caller's edi
86
# ebp + 0: caller's ebp
87
# ebp + 4: return address
89
# ebp + 8: a argument
# ebp + 12: a_len argument
90
# ebp + 16: b argument
91
# ebp + 20: c argument
100
# _s_mpv_mul_d: multiply each digit of a by the single digit b and store the
# resulting digits, then the final carry word, through the output pointer
# (arguments a, a_len, b, c per the stack-frame legend above).
# Two loops are visible: a scalar one (mull, carry kept in %ebx) and an
# MMX-register one (pmuludq, carry kept in %mm2).
# NOTE(review): this listing shows only part of the routine -- the numeric
# local labels targeted by je/jnz, the instructions that set the flags for
# those jumps, and the load of %edi are not visible here; confirm against the
# complete source.
.type _s_mpv_mul_d,@function
117
movl $0,%ebx # ebx = running carry, starts at 0
118
mov 12(%ebp),%ecx # ecx = a_len (digits left to process)
121
je 2f # skip the loop if a_len == 0
122
mov 8(%ebp),%esi # esi = a (source digit pointer)
125
lodsl # eax = [ds:esi]; esi += 4 (fetch next digit a_i)
126
mov 16(%ebp),%edx # edx = b
127
mull %edx # edx:eax = Phi:Plo = a_i * b (unsigned 32x32 -> 64)
129
add %ebx,%eax # add previous carry (%ebx) into the low half of edx:eax
131
mov %edx,%ebx # high half of product becomes next carry
133
stosl # [es:edi] = eax; edi += 4 (emit result digit)
135
jnz 1b # loop while a_len != 0
137
mov %ebx,0(%edi) # *c = final carry (edi now points past the stored digits)
149
psubq %mm2,%mm2 # mm2 = mm2 - mm2 = 0: carry accumulator
150
mov 12(%ebp),%ecx # ecx = a_len
151
movd 16(%ebp),%mm1 # mm1 = b (kept in a register for the whole loop)
154
je 6f # skip the loop if a_len == 0
155
mov 8(%ebp),%esi # esi = a
158
movd 0(%esi),%mm0 # mm0 = *a (this load itself does not advance esi)
160
pmuludq %mm1,%mm0 # mm0 = b * a_i, full 64-bit product of the low dwords
161
paddq %mm0,%mm2 # mm2 = carry + product (64-bit add)
162
movd %mm2,0(%edi) # store the low 32 bits as the result digit
164
psrlq $32, %mm2 # mm2 >>= 32: upper half becomes the next carry
166
jnz 5b # loop while a_len != 0
168
movd %mm2,0(%edi) # *c = final carry
176
# ebp - 36: caller's esi
177
# ebp - 32: caller's edi
185
# ebp + 0: caller's ebp
186
# ebp + 4: return address
187
# ebp + 8: a argument
188
# ebp + 12: a_len argument
189
# ebp + 16: b argument
190
# ebp + 20: c argument
198
# _s_mpv_mul_d_add: multiply-accumulate variant -- per the comments below,
# each a_i * b product is combined with the running carry and the current
# word of *c, and the final carry is stored after the last digit
# (arguments a, a_len, b, c per the stack-frame legend above).
# The symbol has a scalar path (_s_mpv_mul_d_add_x86) and an MMX-register
# path (_s_mpv_mul_d_add_sse2).
# NOTE(review): this listing shows only part of the routine -- the entry
# label, the compare feeding the dispatch jumps, the numeric local labels,
# the adds/adc that fold in *c, and the loop-counter update are not visible
# here; confirm against the complete source.
.globl _s_mpv_mul_d_add
199
.type _s_mpv_mul_d_add,@function
203
je _s_mpv_mul_d_add_x86 # equal -> scalar path (presumably SSE flag == 0; see legend at top of file)
204
jg _s_mpv_mul_d_add_sse2 # greater -> SSE2 path (presumably SSE flag == 1)
208
jg _s_mpv_mul_d_add_sse2 # re-test; presumably after the flag was freshly determined -- TODO confirm
209
_s_mpv_mul_d_add_x86:
216
movl $0,%ebx # ebx = running carry, starts at 0
217
mov 12(%ebp),%ecx # ecx = a_len (digits left to process)
220
je 11f # skip the loop if a_len == 0
221
mov 8(%ebp),%esi # esi = a (source digit pointer)
224
lodsl # eax = [ds:esi]; esi += 4 (fetch next digit a_i)
225
mov 16(%ebp),%edx # edx = b
226
mull %edx # edx:eax = Phi:Plo = a_i * b (unsigned 32x32 -> 64)
228
add %ebx,%eax # add previous carry (%ebx) into the low half of edx:eax
230
mov 0(%edi),%ebx # ebx = current word at [edi] (the *c word to be added in)
233
mov %edx,%ebx # high half of product becomes next carry
235
stosl # [es:edi] = eax; edi += 4 (emit result digit)
237
jnz 10b # loop while a_len != 0
239
mov %ebx,0(%edi) # *c = final carry
246
_s_mpv_mul_d_add_sse2:
251
psubq %mm2,%mm2 # mm2 = mm2 - mm2 = 0: carry accumulator
252
mov 12(%ebp),%ecx # ecx = a_len
253
movd 16(%ebp),%mm1 # mm1 = b (kept in a register for the whole loop)
256
je 16f # skip the loop if a_len == 0
257
mov 8(%ebp),%esi # esi = a
260
movd 0(%esi),%mm0 # mm0 = *a (this load itself does not advance esi)
262
pmuludq %mm1,%mm0 # mm0 = b * a_i, full 64-bit product of the low dwords
263
paddq %mm0,%mm2 # mm2 = carry + product (64-bit add)
265
paddq %mm0,%mm2 # second accumulate; NOTE(review): presumably mm0 was reloaded with the *c word before this add -- reload not visible here, confirm
266
movd %mm2,0(%edi) # store the low 32 bits as the result digit
268
psrlq $32, %mm2 # mm2 >>= 32: upper half becomes the next carry
270
jnz 15b # loop while a_len != 0
272
movd %mm2,0(%edi) # *c = final carry
280
# ebp - 8: caller's esi
281
# ebp - 4: caller's edi
282
# ebp + 0: caller's ebp
283
# ebp + 4: return address
284
# ebp + 8: a argument
285
# ebp + 12: a_len argument
286
# ebp + 16: b argument
287
# ebp + 20: c argument
295
# _s_mpv_mul_d_add_prop: multiply-accumulate with carry propagation -- after
# the per-digit loop, the code tests whether a carry remains ("is carry
# zero?") and, per the comments below, keeps updating successive words of *c
# (arguments a, a_len, b, c per the stack-frame legend above).
# The symbol has a scalar path (_s_mpv_mul_d_add_prop_x86) and an
# MMX-register path (_s_mpv_mul_d_add_prop_sse2).
# NOTE(review): this listing shows only part of the routine -- the compare
# feeding the dispatch jumps, the numeric local labels, the add/adc
# instructions of the propagation tail, and the loop-counter update are not
# visible here; confirm against the complete source.
.globl _s_mpv_mul_d_add_prop
296
.type _s_mpv_mul_d_add_prop,@function
297
_s_mpv_mul_d_add_prop:
300
je _s_mpv_mul_d_add_prop_x86 # equal -> scalar path (presumably SSE flag == 0; see legend at top of file)
301
jg _s_mpv_mul_d_add_prop_sse2 # greater -> SSE2 path (presumably SSE flag == 1)
305
jg _s_mpv_mul_d_add_prop_sse2 # re-test; presumably after the flag was freshly determined -- TODO confirm
306
_s_mpv_mul_d_add_prop_x86:
313
movl $0,%ebx # ebx = running carry, starts at 0
314
mov 12(%ebp),%ecx # ecx = a_len (digits left to process)
317
je 21f # skip the loop if a_len == 0
319
mov 8(%ebp),%esi # esi = a (source digit pointer)
321
lodsl # eax = [ds:esi]; esi += 4 (fetch next digit a_i)
322
mov 16(%ebp),%edx # edx = b
323
mull %edx # edx:eax = Phi:Plo = a_i * b (unsigned 32x32 -> 64)
325
add %ebx,%eax # add previous carry (%ebx) into the low half of edx:eax
327
mov 0(%edi),%ebx # ebx = current word at [edi] (the *c word to be added in)
330
mov %edx,%ebx # high half of product becomes next carry
332
stosl # [es:edi] = eax; edi += 4 (emit result digit)
334
jnz 20b # loop while a_len != 0
336
cmp $0,%ebx # is carry zero? (decides whether propagation is needed)
338
mov 0(%edi),%eax # eax = current word at [edi] (next *c word to receive the carry)
340
stosl # [es:edi] = eax; edi += 4 (write the updated word back)
343
mov 0(%edi),%eax # eax = current word at [edi] (following *c word)
345
stosl # [es:edi] = eax; edi += 4 (write the updated word back)
354
_s_mpv_mul_d_add_prop_sse2:
360
psubq %mm2,%mm2 # mm2 = mm2 - mm2 = 0: carry accumulator
361
mov 12(%ebp),%ecx # ecx = a_len
362
movd 16(%ebp),%mm1 # mm1 = b (kept in a register for the whole loop)
365
je 26f # skip the loop if a_len == 0
366
mov 8(%ebp),%esi # esi = a
369
movd 0(%esi),%mm0 # mm0 = *a (this load itself does not advance esi)
370
movd 0(%edi),%mm3 # mm3 = current word at [edi] (the existing sum digit)
372
pmuludq %mm1,%mm0 # mm0 = b * a_i, full 64-bit product of the low dwords
373
paddq %mm0,%mm2 # mm2 = carry + product (64-bit add)
374
paddq %mm3,%mm2 # mm2 += existing sum digit from *c
375
movd %mm2,0(%edi) # store the low 32 bits as the result digit
377
psrlq $32, %mm2 # mm2 >>= 32: upper half becomes the next carry
379
jnz 25b # loop while a_len != 0
382
cmp $0,%ebx # is carry zero? NOTE(review): carry lives in %mm2 above; presumably copied to %ebx first -- not visible here
389
mov 0(%edi),%eax # eax = current word at [edi] (next *c word to receive the carry)
391
stosl # [es:edi] = eax; edi += 4 (write the updated word back)
403
# ebp - 20: caller's esi
404
# ebp - 16: caller's edi
407
# ebp - 4: a_len local
408
# ebp + 0: caller's ebp
409
# ebp + 4: return address
410
# ebp + 8: pa argument
411
# ebp + 12: a_len argument
412
# ebp + 16: ps argument
422
# _s_mpv_sqr_add_prop: per the comments below, squares each digit of pa and
# adds the two-word result into successive word pairs of ps, then tests for a
# remaining carry and keeps updating following words of ps
# (arguments pa, a_len, ps per the stack-frame legend above).
# The symbol has a scalar path (_s_mpv_sqr_add_prop_x86) and an MMX-register
# path (_s_mpv_sqr_add_prop_sse2).
# NOTE(review): this listing shows only part of the routine -- the entry
# label, the compare feeding the dispatch jumps, the multiply in the scalar
# loop, the numeric local labels, pointer/counter updates and the add/adc
# instructions of the propagation tail are not visible here; confirm against
# the complete source.
.globl _s_mpv_sqr_add_prop
423
.type _s_mpv_sqr_add_prop,@function
427
je _s_mpv_sqr_add_prop_x86 # equal -> scalar path (presumably SSE flag == 0; see legend at top of file)
428
jg _s_mpv_sqr_add_prop_sse2 # greater -> SSE2 path (presumably SSE flag == 1)
432
jg _s_mpv_sqr_add_prop_sse2 # re-test; presumably after the flag was freshly determined -- TODO confirm
433
_s_mpv_sqr_add_prop_x86:
440
movl $0,%ebx # ebx = running carry, starts at 0
441
mov 12(%ebp),%ecx # ecx = a_len (digits left to process)
442
mov 16(%ebp),%edi # edi = ps (destination word pointer)
444
je 31f # skip the loop if a_len == 0
446
mov 8(%ebp),%esi # esi = pa (source digit pointer)
448
lodsl # eax = [ds:esi]; esi += 4 (fetch next digit)
451
add %ebx,%eax # add "carry" into the low word
454
add %ebx,%eax # add low word from result (ebx presumably reloaded from *ps first -- not visible here)
456
stosl # [es:edi] = eax; edi += 4 (store low result word)
457
adc %ebx,%edx # add high word from result, plus the carry flag from the low-word add
461
stosl # [es:edi] = eax; edi += 4 (store high result word)
463
jnz 30b # loop while a_len != 0
465
cmp $0,%ebx # is carry zero? (decides whether propagation is needed)
467
mov 0(%edi),%eax # eax = current word at [edi] (next *ps word to receive the carry)
469
stosl # [es:edi] = eax; edi += 4 (write the updated word back)
472
mov 0(%edi),%eax # eax = current word at [edi] (following *ps word)
474
stosl # [es:edi] = eax; edi += 4 (write the updated word back)
483
_s_mpv_sqr_add_prop_sse2:
489
psubq %mm2,%mm2 # mm2 = mm2 - mm2 = 0: carry accumulator
490
mov 12(%ebp),%ecx # ecx = a_len
493
je 36f # skip the loop if a_len == 0
494
mov 8(%ebp),%esi # esi = pa
497
movd 0(%esi),%mm0 # mm0 = *pa (current digit)
498
movd 0(%edi),%mm3 # mm3 = current word at [edi] (existing low sum word)
500
pmuludq %mm0,%mm0 # mm0 = digit * digit, full 64-bit square
501
paddq %mm0,%mm2 # mm2 = carry + square (64-bit add)
502
paddq %mm3,%mm2 # add the existing low word
504
movd %mm2,0(%edi) # store the low 32 bits at [edi]
506
paddq %mm3,%mm2 # add the high word (mm3 presumably reloaded from 4(%edi) first -- not visible here)
507
movd %mm2,4(%edi) # store the low 32 bits at [edi+4]
508
psrlq $32, %mm2 # mm2 >>= 32: upper half becomes the next carry
511
jnz 35b # loop while a_len != 0
514
cmp $0,%ebx # is carry zero? NOTE(review): carry lives in %mm2 above; presumably copied to %ebx first -- not visible here
521
mov 0(%edi),%eax # eax = current word at [edi] (next *ps word to receive the carry)
523
stosl # [es:edi] = eax; edi += 4 (write the updated word back)
535
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
536
# so its high bit is 1. This code is from NSPR.
538
# mp_err _s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
539
# mp_digit *qp, mp_digit *rp)
541
# esp + 0: Caller's ebx
542
# esp + 4: return address
543
# esp + 8: Nhi argument
544
# esp + 12: Nlo argument
545
# esp + 16: divisor argument
546
# esp + 20: qp argument
547
# esp + 24: rp argument
557
.globl _s_mpv_div_2dx1d
558
.type _s_mpv_div_2dx1d,@function
569
xor %eax,%eax # return zero