1
; The contents of this file are subject to the Mozilla Public
2
; License Version 1.1 (the "License"); you may not use this file
3
; except in compliance with the License. You may obtain a copy of
4
; the License at http://www.mozilla.org/MPL/
6
; Software distributed under the License is distributed on an "AS
7
; IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
8
; implied. See the License for the specific language governing
9
; rights and limitations under the License.
11
; The Original Code is MAXPY multiple-precision integer arithmetic.
13
; The Initial Developer of the Original Code is the Hewlett-Packard Company.
14
; Portions created by Hewlett-Packard Company are
15
; Copyright (C) 1997 Hewlett-Packard Company. All Rights Reserved.
18
; coded by: William B. Ackerman
20
; Alternatively, the contents of this file may be used under the
21
; terms of the GNU General Public License Version 2 or later (the
22
; "GPL"), in which case the provisions of the GPL are applicable
23
; instead of those above. If you wish to allow use of your
24
; version of this file only under the terms of the GPL and not to
25
; allow others to use your version of this file under the MPL,
26
; indicate your decision by deleting the provisions above and
27
; replace them with the notice and other provisions required by
28
; the GPL. If you do not delete the provisions above, a recipient
29
; may use your version of this file under either the MPL or the
40
.SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
42
; ***************************************************************
46
; ***************************************************************
48
; There is no default -- you must specify one or the other.
49
#define LITTLE_WORDIAN 1
56
#define UN_SIXTEEN -16
57
#define UN_TWENTY_FOUR -24
63
#define THIRTY_TWO -32
66
#define UN_TWENTY_FOUR 24
69
; This performs a multiple-precision integer version of "daxpy",
70
; Using the selected addressing direction. "Little-wordian" means that
71
; the least significant word of a number is stored at the lowest address.
72
; "Big-wordian" means that the most significant word is at the lowest
73
; address. Either way, the incoming address of the vector is that
74
; of the least significant word. That means that, for little-wordian
75
; addressing, we move the address upward as we propagate carries
76
; from the least significant word to the most significant. For
77
; big-wordian we move the address downward.
79
; We use the following registers:
81
; r2 return PC, of course
83
; r25 = arg2 = address of scalar
84
; r24 = arg3 = multiplicand vector
85
; r23 = arg4 = result vector
87
; fr9 = scalar loaded once only from r25
89
; The cycle counts shown in the bodies below are simply the result of a
90
; scheduling by hand. The actual PCX-U hardware does it differently.
91
; The intention is that the overall speed is the same.
93
; The pipeline startup and shutdown code is constructed in the usual way,
94
; by taking the loop bodies and removing unnecessary instructions.
95
; We have left the comments describing cycle numbers in the code.
96
; These are intended for reference when comparing with the main loop,
97
; and have no particular relationship to actual cycle numbers.
105
.CALLINFO FRAME=120,ENTRY_GR=%r4
108
; Of course, real men don't use the sissy "enter" and "leave" commands.
109
; They write their own stack manipulation stuff. Unfortunately,
110
; that doesn't generate complete unwind info, whereas "enter" and
111
; "leave" (if the documentation is to be believed) do so. Therefore,
112
; we use the sissy commands. We have verified (by real-man methods)
113
; that the above command generates what we want:
114
; STW,MA %r3,128(%sp)
117
ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately.
118
FLDD 0(%r25),%fr9 ; fr9 = scalar
122
FLDD 0(%r24),%fr24 ; Cycle 1
123
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
124
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
125
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
126
CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3
127
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
128
FLDD EIGHT(%r24),%fr28 ; Cycle 8
129
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
131
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
133
LDO SIXTEEN(%r24),%r24 ; Cycle 12
135
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
140
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
144
FSTD %fr26,-88(%sp) ; Cycle 2
146
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
149
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
153
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
157
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
161
ADD,DC %r20,%r19,%r19 ; Cycle 7
163
SHRPD %r3,%r0,32,%r21
166
FLDD EIGHT(%r24),%fr28 ; Cycle 8
169
SHRPD %r19,%r3,32,%r3
171
LDD -72(%sp),%r29 ; Cycle 9
172
SHRPD %r20,%r19,32,%r20
175
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
179
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
184
LDO SIXTEEN(%r24),%r24 ; Cycle 12
187
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
188
ADD %r0,%r0,%r0 ; clear the carry bit
189
ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12
191
; MFCTL %cr16,%r21 ; for timing
196
$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
201
LDO SIXTEEN(%r23),%r23 ; Cycle 2
205
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
208
LDD UN_EIGHT(%r23),%r21
210
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
215
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
216
ADD,DC %r20,%r31,%r22
220
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
223
STD %r1,UN_SIXTEEN(%r23)
225
ADD,DC %r20,%r19,%r19 ; Cycle 7
226
SHRPD %r3,%r0,32,%r21
230
ADD,DC %r0,%r0,%r20 ; Cycle 8
231
SHRPD %r19,%r3,32,%r3
232
FLDD EIGHT(%r24),%fr28
235
SHRPD %r20,%r19,32,%r20 ; Cycle 9
237
STD %r28,UN_EIGHT(%r23)
240
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
244
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
249
LDO SIXTEEN(%r24),%r24 ; Cycle 12
252
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
254
ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12
259
; Shutdown code, first stage.
261
; MFCTL %cr16,%r21 ; for timing
262
; STD %r21,UN_SIXTEEN(%r23)
264
; STD %r21,UN_EIGHT(%r23)
266
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
268
CMPIB,= 0,%r26,$ONEMORE
271
LDO SIXTEEN(%r23),%r23 ; Cycle 2
275
ADD %r3,%r1,%r1 ; Cycle 3
277
LDD UN_EIGHT(%r23),%r21
279
ADD,DC %r21,%r4,%r28 ; Cycle 4
281
STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
284
ADD,DC %r20,%r31,%r22 ; Cycle 5
285
STD %r1,UN_SIXTEEN(%r23)
290
ADD %r21,%r3,%r3 ; Cycle 6
293
ADD,DC %r20,%r19,%r19 ; Cycle 7
294
SHRPD %r3,%r0,32,%r21
298
ADD,DC %r0,%r0,%r20 ; Cycle 8
299
SHRPD %r19,%r3,32,%r3
302
SHRPD %r20,%r19,32,%r20 ; Cycle 9
306
ADD,DC %r3,%r4,%r4 ; Cycle 10
308
ADD,DC %r0,%r20,%r20 ; Cycle 11
311
ADD %r22,%r1,%r1 ; Cycle 13
313
; Shutdown code, second stage.
315
ADD,DC %r29,%r4,%r4 ; Cycle 1
317
LDO SIXTEEN(%r23),%r23 ; Cycle 2
320
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
323
ADD,DC %r21,%r4,%r28 ; Cycle 4
325
ADD,DC %r20,%r31,%r22 ; Cycle 5
327
STD %r1,UN_SIXTEEN(%r23); Cycle 6
329
STD %r28,UN_EIGHT(%r23) ; Cycle 9
331
LDD 0(%r23),%r3 ; Cycle 11
333
; Shutdown code, third stage.
335
LDO SIXTEEN(%r23),%r23
337
$JOIN1 ADD,DC %r0,%r0,%r21
338
CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
339
STD %r1,UN_SIXTEEN(%r23)
341
; Final carry propagation
343
$FINAL1 LDO EIGHT(%r23),%r23
344
LDD UN_SIXTEEN(%r23),%r21
346
CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
347
STD %r21,UN_SIXTEEN(%r23)
351
; Here is the code that handles the difficult cases N=1, N=2, and N=3.
352
; We do the usual trick -- branch out of the startup code at appropriate
353
; points, and branch into the shutdown code.
356
CMPIB,= 0,%r26,$N_IS_ONE
357
FSTD %fr24,-96(%sp) ; Cycle 10
358
FLDD EIGHT(%r24),%fr28 ; Cycle 8
359
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
360
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
362
FSTD %fr31,-64(%sp) ; Cycle 12
363
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
365
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
366
CMPIB,= 2,%r26,$N_IS_THREE
370
FSTD %fr26,-88(%sp) ; Cycle 2
371
FSTD %fr28,-104(%sp) ; Cycle 3
372
LDD -96(%sp),%r3 ; Cycle 4
378
FLDD SIXTEEN(%r24),%fr24
379
FSTD %fr26,-88(%sp) ; Cycle 2
380
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
382
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
385
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
394
FSTD %fr26,-88(%sp) ; Cycle 2
398
; We came out of the unrolled loop with wrong parity. Do one more
399
; single cycle. This is quite tricky, because of the way the
400
; carry chains and SHRPD chains have been chopped up.
406
LDO SIXTEEN(%r23),%r23 ; Cycle 2
410
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
412
LDD UN_EIGHT(%r23),%r21
415
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
417
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
421
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
422
ADD,DC %r20,%r31,%r22
426
STD %r1,UN_SIXTEEN(%r23); Cycle 6
428
XMPYU %fr9L,%fr24R,%fr24
432
ADD,DC %r20,%r19,%r19 ; Cycle 7
434
SHRPD %r3,%r0,32,%r21
437
LDD -104(%sp),%r31 ; Cycle 8
439
SHRPD %r19,%r3,32,%r3
441
LDD -72(%sp),%r29 ; Cycle 9
442
SHRPD %r20,%r19,32,%r20
445
ADD,DC %r3,%r4,%r4 ; Cycle 10
448
ADD,DC %r0,%r20,%r20 ; Cycle 11
452
ADD %r22,%r1,%r1 ; Cycle 13
455
; Shutdown code, stage 1-1/2.
457
ADD,DC %r29,%r4,%r4 ; Cycle 1
459
LDO SIXTEEN(%r23),%r23 ; Cycle 2
463
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
466
ADD,DC %r21,%r4,%r28 ; Cycle 4
467
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
469
ADD,DC %r20,%r31,%r22 ; Cycle 5
470
STD %r1,UN_SIXTEEN(%r23)
472
LDD -96(%sp),%r3 ; moved from cycle 4
474
ADD %r21,%r3,%r3 ; Cycle 6
475
ADD,DC %r0,%r0,%r19 ; Cycle 7
477
SHRPD %r3,%r0,32,%r21
479
SHRPD %r19,%r3,32,%r3 ; Cycle 8
480
ADD %r21,%r1,%r1 ; Cycle 9
481
ADD,DC %r3,%r4,%r4 ; Cycle 10
482
LDD 0(%r23),%r3 ; Cycle 11
483
ADD %r22,%r1,%r1 ; Cycle 13
485
; Shutdown code, stage 2-1/2.
487
ADD,DC %r0,%r4,%r4 ; Cycle 1
488
LDO SIXTEEN(%r23),%r23 ; Cycle 2
489
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
491
STD %r1,UN_SIXTEEN(%r23)
501
; We have verified that the above command generates what we want:
504
; LDW,MB -128(%sp),%r3
508
; ***************************************************************
510
; add_diag_[little/big]
512
; ***************************************************************
514
; The arguments are as follows:
515
; r2 return PC, of course
516
; r26 = arg1 = length
517
; r25 = arg2 = vector to square
518
; r24 = arg3 = result vector
520
#ifdef LITTLE_WORDIAN
526
.CALLINFO FRAME=120,ENTRY_GR=%r4
529
ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately.
534
FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
535
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
536
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
537
XMPYU %fr7L,%fr7L,%fr30
538
LDO SIXTEEN(%r25),%r25 ; Cycle 6
540
FSTD %fr27,-72(%sp) ; Cycle 7
541
CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
543
FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
544
LDD -88(%sp),%r22 ; Cycle 3
545
LDD -72(%sp),%r31 ; Cycle 4
546
XMPYU %fr7R,%fr7R,%fr28
547
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
548
XMPYU %fr7L,%fr7L,%fr31
549
LDD -96(%sp),%r20 ; Cycle 6
551
ADD %r0,%r0,%r0 ; clear the carry bit
552
ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
555
; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
558
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
559
LDO SIXTEEN(%r25),%r25
562
SHRPD %r0,%r31,31,%r4 ; Cycle 2
564
FLDD UN_SIXTEEN(%r25),%fr7
565
ADD,DC %r0,%r20,%r20 ; Cycle 3
567
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
570
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
571
XMPYU %fr7L,%fr7L,%fr30
574
ADD,DC %r4,%r20,%r20 ; Cycle 6
577
ADD %r20,%r1,%r1 ; Cycle 7
579
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
580
LDO THIRTY_TWO(%r24),%r24
581
LDD UN_SIXTEEN(%r24),%r28
583
SHRPD %r0,%r29,31,%r3 ; Cycle 2
585
FLDD UN_EIGHT(%r25),%fr7
586
STD %r1,UN_TWENTY_FOUR(%r24)
587
ADD,DC %r0,%r19,%r19 ; Cycle 3
589
XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
591
STD %r4,UN_SIXTEEN(%r24)
592
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
593
XMPYU %fr7L,%fr7L,%fr31
595
LDD UN_EIGHT(%r24),%r28
596
ADD,DC %r3,%r19,%r19 ; Cycle 6
599
ADD %r19,%r28,%r28 ; Cycle 7
601
ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8
602
STD %r28,UN_EIGHT(%r24)
607
CMPIB,= 0,%r26,$ONEMOREDIAG
608
SHRPD %r31,%r0,31,%r3
610
; Shutdown code, first stage.
612
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
614
SHRPD %r0,%r31,31,%r4 ; Cycle 2
616
ADD,DC %r0,%r20,%r20 ; Cycle 3
619
LDD -64(%sp),%r29 ; Cycle 4
621
LDD EIGHT(%r24),%r1 ; Cycle 5
622
LDO SIXTEEN(%r25),%r25 ; Cycle 6
625
ADD %r20,%r1,%r1 ; Cycle 7
626
ADD,DC %r0,%r21,%r21 ; Cycle 8
629
; Shutdown code, second stage.
631
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
632
LDO THIRTY_TWO(%r24),%r24
633
LDD UN_SIXTEEN(%r24),%r1
634
SHRPD %r0,%r29,31,%r3 ; Cycle 2
636
ADD,DC %r0,%r19,%r19 ; Cycle 3
638
STD %r4,UN_SIXTEEN(%r24); Cycle 4
639
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
640
ADD,DC %r3,%r19,%r19 ; Cycle 6
641
ADD %r19,%r28,%r28 ; Cycle 7
642
ADD,DC %r0,%r0,%r22 ; Cycle 8
643
CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
644
STD %r28,UN_EIGHT(%r24)
646
; Final carry propagation
650
LDD UN_EIGHT(%r24),%r26
652
CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
653
STD %r26,UN_EIGHT(%r24)
658
; Here is the code that handles the difficult case N=1.
659
; We do the usual trick -- branch out of the startup code at appropriate
660
; points, and branch into the shutdown code.
669
; We came out of the unrolled loop with wrong parity. Do one more
670
; single cycle. This is the "alternate body". It will, of course,
671
; give us opposite registers from the other case, so we need
672
; completely different shutdown code.
675
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
677
FLDD 0(%r25),%fr7 ; Cycle 2
678
SHRPD %r0,%r31,31,%r4
680
ADD,DC %r0,%r20,%r20 ; Cycle 3
683
LDD -64(%sp),%r29 ; Cycle 4
685
XMPYU %fr7R,%fr7R,%fr29
686
LDD EIGHT(%r24),%r1 ; Cycle 5
687
XMPYU %fr7L,%fr7R,%fr27
688
XMPYU %fr7L,%fr7L,%fr30
689
LDD -104(%sp),%r19 ; Cycle 6
692
FSTD %fr27,-72(%sp) ; Cycle 7
694
ADD,DC %r0,%r21,%r21 ; Cycle 8
697
; Shutdown code, first stage.
699
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
700
LDO THIRTY_TWO(%r24),%r24
702
LDD UN_SIXTEEN(%r24),%r1
703
SHRPD %r0,%r29,31,%r3 ; Cycle 2
705
ADD,DC %r0,%r19,%r19 ; Cycle 3
708
LDD -72(%sp),%r31 ; Cycle 4
709
STD %r4,UN_SIXTEEN(%r24)
710
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
711
LDD -96(%sp),%r20 ; Cycle 6
713
ADD %r19,%r28,%r28 ; Cycle 7
714
ADD,DC %r0,%r22,%r22 ; Cycle 8
715
STD %r28,UN_EIGHT(%r24)
717
; Shutdown code, second stage.
720
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
722
SHRPD %r0,%r31,31,%r4 ; Cycle 2
724
ADD,DC %r0,%r20,%r20 ; Cycle 3
726
STD %r3,0(%r24) ; Cycle 4
727
LDD EIGHT(%r24),%r1 ; Cycle 5
729
ADD %r20,%r1,%r1 ; Cycle 7
730
ADD,DC %r0,%r0,%r21 ; Cycle 8
731
CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
734
; Final carry propagation
740
CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
750
#ifdef LITTLE_WORDIAN
751
.EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
752
.EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
754
.EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
755
.EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
760
; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
762
; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
763
; performs a 64-bit x any-size multiply, and adds the
764
; result to an area of memory. That is, it performs
772
; and then adds the "PQRST" vector into an area of memory,
773
; handling all carries.
775
; Digression on nomenclature and endian-ness:
777
; Each of the capital letters in the above represents a 64-bit
778
; quantity. That is, you could think of the discussion as
779
; being in terms of radix-16-quintillion arithmetic. The data
780
; type being manipulated is "unsigned long long int". This
781
; requires the 64-bit extension of the HP-UX C compiler,
782
; available at release 10. You need these compiler flags to
783
; enable these extensions:
785
; -Aa +e +DA2.0 +DS2.0
787
; (The first specifies ANSI C, the second enables the
788
; extensions, which are beyond ANSI C, and the third and
789
; fourth tell the compiler to use whatever features of the
790
; PA2.0 architecture it wishes, in order to made the code more
791
; efficient. Since the presence of the assembly code will
792
; make the program unable to run on anything less than PA2.0,
793
; you might as well gain the performance enhancements in the C
796
; Questions of "endian-ness" often come up, usually in the
797
; context of byte ordering in a word. These routines have a
798
; similar issue, that could be called "wordian-ness".
799
; Independent of byte ordering (PA is always big-endian), one
800
; can make two choices when representing extremely large
801
; numbers as arrays of 64-bit doublewords in memory.
803
; "Little-wordian" layout means that the least significant
804
; word of a number is stored at the lowest address.
813
; | | |____ address 0
815
; | |_______address 8
819
; "Big-wordian" means that the most significant word is at the
829
; | | |____ address 32
831
; | |_______address 24
835
; When you compile the file, you must specify one or the other, with
836
; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
838
; Incidentally, you assemble this file as part of your
839
; project with the same C compiler as the rest of the program.
840
; My "makefile" for a superprecision arithmetic package has
841
; the following stuff:
844
; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
846
; LDFLAGS = -L /usr/lib -Wl,-aarchive
848
; # general build rule for ".s" files:
850
; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
852
; # Now any bind step that calls for pa20.o will assemble pa20.s
854
; End of digression, back to arithmetic:
856
; The way we multiply two huge numbers is, of course, to multiply
857
; the "ABCD" vector by each of the "WXYZ" doublewords, adding
858
; the result vectors with increasing offsets, the way we learned
859
; in school, back before we all used calculators:
871
; So we call maxpy_PA20_big (in my case; my package is
872
; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
873
; in turn as the "scalar", and giving the "ABCD" vector each
874
; time. We direct it to add its result into an area of memory
875
; that we have cleared at the start. We skew the exact
876
; location into that area with each call.
878
; The prototype for the function is
880
; extern void maxpy_PA20_big(
881
; int length, /* Number of doublewords in the multiplicand vector. */
882
; const long long int *scalaraddr, /* Address to fetch the scalar. */
883
; const long long int *multiplicand, /* The multiplicand vector. */
884
; long long int *result); /* Where to accumulate the result. */
886
; (You should place a copy of this prototype in an include file
887
; or in your C file.)
889
; Now, IN ALL CASES, the given address for the multiplicand or
890
; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
891
; That word is, of course, the word at which the routine
892
; starts processing. "maxpy_PA20_little" then increases the
893
; addresses as it computes. "maxpy_PA20_big" decreases them.
895
; In our example above, "length" would be 4 in each case.
896
; "multiplicand" would be the "ABCD" vector. Specifically,
897
; the address of the element "D". "scalaraddr" would be the
898
; address of "W", "X", "Y", or "Z" on the four calls that we
899
; would make. (The order doesn't matter, of course.)
900
; "result" would be the appropriate address in the result
901
; area. When multiplying by "Z", that would be the least
902
; significant word. When multiplying by "Y", it would be the
903
; next higher word (8 bytes higher if little-wordian; 8 bytes
904
; lower if big-wordian), and so on. The size of the result
905
; area must be the the sum of the sizes of the multiplicand
906
; and multiplier vectors, and must be initialized to zero
909
; Whenever the routine adds its partial product into the result
910
; vector, it follows carry chains as far as they need to go.
912
; Here is the super-precision multiply routine that I use for
913
; my package. The package is big-wordian. I have taken out
914
; handling of exponents (it's a floating point package):
916
; static void mul_PA20(
918
; const long long int *arg1,
919
; const long long int *arg2,
920
; long long int *result)
924
; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
926
; for (i=0 ; i<size ; i++) {
927
; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);