1
; HP-PA 2.0 64-bit __gmpn_submul_1 -- Multiply a limb vector with a limb and
2
; subtract the result from a second limb vector.
4
; Copyright 1998, 1999, 2000 Free Software Foundation, Inc.
6
; This file is part of the GNU MP Library.
8
; The GNU MP Library is free software; you can redistribute it and/or modify
9
; it under the terms of the GNU Lesser General Public License as published by
10
; the Free Software Foundation; either version 2.1 of the License, or (at your
11
; option) any later version.
13
; The GNU MP Library is distributed in the hope that it will be useful, but
14
; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
; License for more details.
18
; You should have received a copy of the GNU Lesser General Public License
19
; along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20
; the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
; MA 02111-1307, USA.
27
#define s2limb -56(%r30)
29
; This runs at 11 cycles/limb on a PA8000. It might be possible to make
30
; it faster, but the PA8000 pipeline is not publicly documented and it
31
; is very complex to reverse engineer
46
; mp_limb_t __gmpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
;   Multiply {up,n} by v (s2limb, passed on the stack at -56(%r30)) and
;   subtract the product from {rp,n}; the final borrow limb is returned
;   in %r28 (see the extrd at the end of this chunk).
; NOTE(review): the #defines for cylimb, lo, m0, m1, hi and t1..t5 are not
;   visible in this chunk -- they are register aliases defined earlier in
;   the original file; confirm against the full source.
; The limb counter is %r24 (decremented by the addib instructions below).
; Each 64x64->128 product is built from four 32x32->64 xmpyu partial
; products (fr5R*fr4R, fr5R*fr4L, fr5L*fr4R, fr5L*fr4L), which are
; spilled to the frame at -128..-104(%r30) and reloaded into integer
; registers as lo/m0/m1/hi for the cross-sum.
.export __gmpn_submul_1,entry
49
.callinfo frame=128,no_calls
51
fldd -56(%r30),%fr5 ; s2limb passed on stack
53
add %r0,%r0,cylimb ; clear cy and cylimb
64
; First (pipeline-priming) round of the four partial products.
xmpyu %fr5R,%fr4R,%fr6
66
xmpyu %fr5R,%fr4L,%fr7
68
xmpyu %fr5L,%fr4R,%fr8
70
xmpyu %fr5L,%fr4L,%fr9
72
ldd -128(%r30),lo ; lo = low 64 bit of product
73
ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
74
ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
75
ldd -104(%r30),hi ; hi = high 64 bit of product
76
addib,= -1,%r24,L$end1 ; n == 1: skip to single-limb tail
80
addib,= -1,%r24,L$end2 ; n == 2: skip to two-limb tail
83
; Start the next iteration's partial products while the integer side
; combines the previous ones (software pipelining).
xmpyu %fr5R,%fr4R,%fr6
85
xmpyu %fr5R,%fr4L,%fr7
87
xmpyu %fr5L,%fr4R,%fr8
89
xmpyu %fr5L,%fr4L,%fr9
92
; Combine the four partial products: the two mid products are added to
; the high half of lo; *nuv catches overflow of that 64-bit sum.
extrd,u lo,31,32,t1 ; t1 = hi32(lo)
93
extrd,u lo,63,32,t4 ; t4 = lo32(lo)
94
add,l m0,t1,t1 ; t1 += m0
95
add,l,*nuv m1,t1,t1 ; t1 += m1
96
add,l %r5,hi,hi ; propagate carry
97
extrd,u t1,31,32,t2 ; t2 = hi32(t1)
98
depd,z t1,31,32,t5 ; t5 = lo32(t1)
99
add,l t5,t4,t4 ; t4 += lo32(t1)
100
; Reload the freshly spilled partial products for the next round.
ldd -128(%r30),lo ; lo = low 64 bit of product
102
ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
104
ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
107
ldd -104(%r30),hi ; hi = high 64 bit of product
108
add,dc %r0,cylimb,cylimb ; fold borrow (carry flag) into cylimb
112
addib,<> -1,%r24,L$loop ; loop while limbs remain
115
; Loop drained: two more combine rounds follow to flush the pipeline.
xmpyu %fr5R,%fr4R,%fr6
117
xmpyu %fr5R,%fr4L,%fr7
119
xmpyu %fr5L,%fr4R,%fr8
121
xmpyu %fr5L,%fr4L,%fr9
124
extrd,u lo,31,32,t1 ; t1 = hi32(lo)
125
extrd,u lo,63,32,t4 ; t4 = lo32(lo)
126
add,l m0,t1,t1 ; t1 += m0
127
add,l,*nuv m1,t1,t1 ; t1 += m1
128
add,l %r5,hi,hi ; propagate carry
129
extrd,u t1,31,32,t2 ; t2 = hi32(t1)
130
depd,z t1,31,32,t5 ; t5 = lo32(t1)
131
add,l t5,t4,t4 ; t4 += lo32(t1)
132
ldd -128(%r30),lo ; lo = low 64 bit of product
134
ldd -120(%r30),m0 ; m0 = mid0 64 bit of product
136
ldd -112(%r30),m1 ; m1 = mid1 64 bit of product
139
ldd -104(%r30),hi ; hi = high 64 bit of product
140
add,dc %r0,cylimb,cylimb ; fold borrow into cylimb
145
; Final combine round (no further products started).
extrd,u lo,31,32,t1 ; t1 = hi32(lo)
146
extrd,u lo,63,32,t4 ; t4 = lo32(lo)
147
add,l m0,t1,t1 ; t1 += m0
148
add,l,*nuv m1,t1,t1 ; t1 += m1
149
add,l %r5,hi,hi ; propagate carry
150
extrd,u t1,31,32,t2 ; t2 = hi32(t1)
151
depd,z t1,31,32,t5 ; t5 = lo32(t1)
152
add,l t5,t4,t4 ; t4 += lo32(t1)
157
add,dc %r0,cylimb,cylimb ; final borrow into cylimb
166
extrd,u cylimb,31,32,%r28 ; return borrow limb in %r28