1
; ***** BEGIN LICENSE BLOCK *****
2
; Version: MPL 1.1/GPL 2.0/LGPL 2.1
4
; The contents of this file are subject to the Mozilla Public License Version
5
; 1.1 (the "License"); you may not use this file except in compliance with
6
; the License. You may obtain a copy of the License at
7
; http://www.mozilla.org/MPL/
9
; Software distributed under the License is distributed on an "AS IS" basis,
10
; WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11
; for the specific language governing rights and limitations under the
14
; The Original Code is MAXPY multiple-precision integer arithmetic.
16
; The Initial Developer of the Original Code is
17
; the Hewlett-Packard Company.
18
; Portions created by the Initial Developer are Copyright (C) 1997
19
; the Initial Developer. All Rights Reserved.
22
; coded by: William B. Ackerman
24
; Alternatively, the contents of this file may be used under the terms of
25
; either the GNU General Public License Version 2 or later (the "GPL"), or
26
; the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27
; in which case the provisions of the GPL or the LGPL are applicable instead
28
; of those above. If you wish to allow use of your version of this file only
29
; under the terms of either the GPL or the LGPL, and not to allow others to
30
; use your version of this file under the terms of the MPL, indicate your
31
; decision by deleting the provisions above and replace them with the notice
32
; and other provisions required by the GPL or the LGPL. If you do not delete
33
; the provisions above, a recipient may use your version of this file under
34
; the terms of any one of the MPL, the GPL or the LGPL.
36
; ***** END LICENSE BLOCK *****
46
.SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
48
; ***************************************************************
52
; ***************************************************************
54
; There is no default -- you must specify one or the other.
55
#define LITTLE_WORDIAN 1
62
#define UN_SIXTEEN -16
63
#define UN_TWENTY_FOUR -24
69
#define THIRTY_TWO -32
72
#define UN_TWENTY_FOUR 24
75
; This performs a multiple-precision integer version of "daxpy",
76
; Using the selected addressing direction. "Little-wordian" means that
77
; the least significant word of a number is stored at the lowest address.
78
; "Big-wordian" means that the most significant word is at the lowest
79
; address. Either way, the incoming address of the vector is that
80
; of the least significant word. That means that, for little-wordian
81
; addressing, we move the address upward as we propagate carries
82
; from the least significant word to the most significant. For
83
; big-wordian we move the address downward.
85
; We use the following registers:
87
; r2 return PC, of course
89
; r25 = arg2 = address of scalar
90
; r24 = arg3 = multiplicand vector
91
; r23 = arg4 = result vector
93
; fr9 = scalar loaded once only from r25
95
; The cycle counts shown in the bodies below are simply the result of a
96
; scheduling by hand. The actual PCX-U hardware does it differently.
97
; The intention is that the overall speed is the same.
99
; The pipeline startup and shutdown code is constructed in the usual way,
100
; by taking the loop bodies and removing unnecessary instructions.
101
; We have left the comments describing cycle numbers in the code.
102
; These are intended for reference when comparing with the main loop,
103
; and have no particular relationship to actual cycle numbers.
105
#ifdef LITTLE_WORDIAN
111
.CALLINFO FRAME=120,ENTRY_GR=%r4
114
; Of course, real men don't use the sissy "enter" and "leave" commands.
115
; They write their own stack manipulation stuff. Unfortunately,
116
; that doesn't generate complete unwind info, whereas "enter" and
117
; "leave" (if the documentation is to be believed) do so. Therefore,
118
; we use the sissy commands. We have verified (by real-man methods)
119
; that the above command generates what we want:
120
; STW,MA %r3,128(%sp)
123
ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately.
124
FLDD 0(%r25),%fr9 ; fr9 = scalar
128
FLDD 0(%r24),%fr24 ; Cycle 1
129
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
130
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
131
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
132
CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3
133
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
134
FLDD EIGHT(%r24),%fr28 ; Cycle 8
135
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
137
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
139
LDO SIXTEEN(%r24),%r24 ; Cycle 12
141
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
146
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
150
FSTD %fr26,-88(%sp) ; Cycle 2
152
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
155
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
159
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
163
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
167
ADD,DC %r20,%r19,%r19 ; Cycle 7
169
SHRPD %r3,%r0,32,%r21
172
FLDD EIGHT(%r24),%fr28 ; Cycle 8
175
SHRPD %r19,%r3,32,%r3
177
LDD -72(%sp),%r29 ; Cycle 9
178
SHRPD %r20,%r19,32,%r20
181
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
185
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
190
LDO SIXTEEN(%r24),%r24 ; Cycle 12
193
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
194
ADD %r0,%r0,%r0 ; clear the carry bit
195
ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12
197
; MFCTL %cr16,%r21 ; for timing
202
$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
207
LDO SIXTEEN(%r23),%r23 ; Cycle 2
211
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
214
LDD UN_EIGHT(%r23),%r21
216
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
221
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
222
ADD,DC %r20,%r31,%r22
226
XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6
229
STD %r1,UN_SIXTEEN(%r23)
231
ADD,DC %r20,%r19,%r19 ; Cycle 7
232
SHRPD %r3,%r0,32,%r21
236
ADD,DC %r0,%r0,%r20 ; Cycle 8
237
SHRPD %r19,%r3,32,%r3
238
FLDD EIGHT(%r24),%fr28
241
SHRPD %r20,%r19,32,%r20 ; Cycle 9
243
STD %r28,UN_EIGHT(%r23)
246
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
250
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
255
LDO SIXTEEN(%r24),%r24 ; Cycle 12
258
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
260
ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12
265
; Shutdown code, first stage.
267
; MFCTL %cr16,%r21 ; for timing
268
; STD %r21,UN_SIXTEEN(%r23)
270
; STD %r21,UN_EIGHT(%r23)
272
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
274
CMPIB,= 0,%r26,$ONEMORE
277
LDO SIXTEEN(%r23),%r23 ; Cycle 2
281
ADD %r3,%r1,%r1 ; Cycle 3
283
LDD UN_EIGHT(%r23),%r21
285
ADD,DC %r21,%r4,%r28 ; Cycle 4
287
STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9
290
ADD,DC %r20,%r31,%r22 ; Cycle 5
291
STD %r1,UN_SIXTEEN(%r23)
296
ADD %r21,%r3,%r3 ; Cycle 6
299
ADD,DC %r20,%r19,%r19 ; Cycle 7
300
SHRPD %r3,%r0,32,%r21
304
ADD,DC %r0,%r0,%r20 ; Cycle 8
305
SHRPD %r19,%r3,32,%r3
308
SHRPD %r20,%r19,32,%r20 ; Cycle 9
312
ADD,DC %r3,%r4,%r4 ; Cycle 10
314
ADD,DC %r0,%r20,%r20 ; Cycle 11
317
ADD %r22,%r1,%r1 ; Cycle 13
319
; Shutdown code, second stage.
321
ADD,DC %r29,%r4,%r4 ; Cycle 1
323
LDO SIXTEEN(%r23),%r23 ; Cycle 2
326
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
329
ADD,DC %r21,%r4,%r28 ; Cycle 4
331
ADD,DC %r20,%r31,%r22 ; Cycle 5
333
STD %r1,UN_SIXTEEN(%r23); Cycle 6
335
STD %r28,UN_EIGHT(%r23) ; Cycle 9
337
LDD 0(%r23),%r3 ; Cycle 11
339
; Shutdown code, third stage.
341
LDO SIXTEEN(%r23),%r23
343
$JOIN1 ADD,DC %r0,%r0,%r21
344
CMPIB,*= 0,%r21,$L0 ; if no overflow, exit
345
STD %r1,UN_SIXTEEN(%r23)
347
; Final carry propagation
349
$FINAL1 LDO EIGHT(%r23),%r23
350
LDD UN_SIXTEEN(%r23),%r21
352
CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry.
353
STD %r21,UN_SIXTEEN(%r23)
357
; Here is the code that handles the difficult cases N=1, N=2, and N=3.
358
; We do the usual trick -- branch out of the startup code at appropriate
359
; points, and branch into the shutdown code.
362
CMPIB,= 0,%r26,$N_IS_ONE
363
FSTD %fr24,-96(%sp) ; Cycle 10
364
FLDD EIGHT(%r24),%fr28 ; Cycle 8
365
XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10
366
XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11
368
FSTD %fr31,-64(%sp) ; Cycle 12
369
XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13
371
XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1
372
CMPIB,= 2,%r26,$N_IS_THREE
376
FSTD %fr26,-88(%sp) ; Cycle 2
377
FSTD %fr28,-104(%sp) ; Cycle 3
378
LDD -96(%sp),%r3 ; Cycle 4
384
FLDD SIXTEEN(%r24),%fr24
385
FSTD %fr26,-88(%sp) ; Cycle 2
386
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
388
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
391
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
400
FSTD %fr26,-88(%sp) ; Cycle 2
404
; We came out of the unrolled loop with wrong parity. Do one more
405
; single cycle. This is quite tricky, because of the way the
406
; carry chains and SHRPD chains have been chopped up.
412
LDO SIXTEEN(%r23),%r23 ; Cycle 2
416
XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3
418
LDD UN_EIGHT(%r23),%r21
421
XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4
423
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
427
XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5
428
ADD,DC %r20,%r31,%r22
432
STD %r1,UN_SIXTEEN(%r23); Cycle 6
434
XMPYU %fr9L,%fr24R,%fr24
438
ADD,DC %r20,%r19,%r19 ; Cycle 7
440
SHRPD %r3,%r0,32,%r21
443
LDD -104(%sp),%r31 ; Cycle 8
445
SHRPD %r19,%r3,32,%r3
447
LDD -72(%sp),%r29 ; Cycle 9
448
SHRPD %r20,%r19,32,%r20
451
ADD,DC %r3,%r4,%r4 ; Cycle 10
454
ADD,DC %r0,%r20,%r20 ; Cycle 11
458
ADD %r22,%r1,%r1 ; Cycle 13
461
; Shutdown code, stage 1-1/2.
463
ADD,DC %r29,%r4,%r4 ; Cycle 1
465
LDO SIXTEEN(%r23),%r23 ; Cycle 2
469
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
472
ADD,DC %r21,%r4,%r28 ; Cycle 4
473
STD %r28,UN_EIGHT(%r23) ; moved from cycle 9
475
ADD,DC %r20,%r31,%r22 ; Cycle 5
476
STD %r1,UN_SIXTEEN(%r23)
478
LDD -96(%sp),%r3 ; moved from cycle 4
480
ADD %r21,%r3,%r3 ; Cycle 6
481
ADD,DC %r0,%r0,%r19 ; Cycle 7
483
SHRPD %r3,%r0,32,%r21
485
SHRPD %r19,%r3,32,%r3 ; Cycle 8
486
ADD %r21,%r1,%r1 ; Cycle 9
487
ADD,DC %r3,%r4,%r4 ; Cycle 10
488
LDD 0(%r23),%r3 ; Cycle 11
489
ADD %r22,%r1,%r1 ; Cycle 13
491
; Shutdown code, stage 2-1/2.
493
ADD,DC %r0,%r4,%r4 ; Cycle 1
494
LDO SIXTEEN(%r23),%r23 ; Cycle 2
495
LDD UN_EIGHT(%r23),%r21 ; Cycle 3
497
STD %r1,UN_SIXTEEN(%r23)
507
; We have verified that the above command generates what we want:
510
; LDW,MB -128(%sp),%r3
514
; ***************************************************************
516
; add_diag_[little/big]
518
; ***************************************************************
520
; The arguments are as follows:
521
; r2 return PC, of course
522
; r26 = arg1 = length
523
; r25 = arg2 = vector to square
524
; r24 = arg3 = result vector
526
#ifdef LITTLE_WORDIAN
532
.CALLINFO FRAME=120,ENTRY_GR=%r4
535
ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately.
540
FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body)
541
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
542
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
543
XMPYU %fr7L,%fr7L,%fr30
544
LDO SIXTEEN(%r25),%r25 ; Cycle 6
546
FSTD %fr27,-72(%sp) ; Cycle 7
547
CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body)
549
FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2
550
LDD -88(%sp),%r22 ; Cycle 3
551
LDD -72(%sp),%r31 ; Cycle 4
552
XMPYU %fr7R,%fr7R,%fr28
553
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
554
XMPYU %fr7L,%fr7L,%fr31
555
LDD -96(%sp),%r20 ; Cycle 6
557
ADD %r0,%r0,%r0 ; clear the carry bit
558
ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
561
; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body".
564
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
565
LDO SIXTEEN(%r25),%r25
568
SHRPD %r0,%r31,31,%r4 ; Cycle 2
570
FLDD UN_SIXTEEN(%r25),%fr7
571
ADD,DC %r0,%r20,%r20 ; Cycle 3
573
XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4
576
XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5
577
XMPYU %fr7L,%fr7L,%fr30
580
ADD,DC %r4,%r20,%r20 ; Cycle 6
583
ADD %r20,%r1,%r1 ; Cycle 7
585
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
586
LDO THIRTY_TWO(%r24),%r24
587
LDD UN_SIXTEEN(%r24),%r28
589
SHRPD %r0,%r29,31,%r3 ; Cycle 2
591
FLDD UN_EIGHT(%r25),%fr7
592
STD %r1,UN_TWENTY_FOUR(%r24)
593
ADD,DC %r0,%r19,%r19 ; Cycle 3
595
XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4
597
STD %r4,UN_SIXTEEN(%r24)
598
XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5
599
XMPYU %fr7L,%fr7L,%fr31
601
LDD UN_EIGHT(%r24),%r28
602
ADD,DC %r3,%r19,%r19 ; Cycle 6
605
ADD %r19,%r28,%r28 ; Cycle 7
607
ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8
608
STD %r28,UN_EIGHT(%r24)
613
CMPIB,= 0,%r26,$ONEMOREDIAG
614
SHRPD %r31,%r0,31,%r3
616
; Shutdown code, first stage.
618
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
620
SHRPD %r0,%r31,31,%r4 ; Cycle 2
622
ADD,DC %r0,%r20,%r20 ; Cycle 3
625
LDD -64(%sp),%r29 ; Cycle 4
627
LDD EIGHT(%r24),%r1 ; Cycle 5
628
LDO SIXTEEN(%r25),%r25 ; Cycle 6
631
ADD %r20,%r1,%r1 ; Cycle 7
632
ADD,DC %r0,%r21,%r21 ; Cycle 8
635
; Shutdown code, second stage.
637
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
638
LDO THIRTY_TWO(%r24),%r24
639
LDD UN_SIXTEEN(%r24),%r1
640
SHRPD %r0,%r29,31,%r3 ; Cycle 2
642
ADD,DC %r0,%r19,%r19 ; Cycle 3
644
STD %r4,UN_SIXTEEN(%r24); Cycle 4
645
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
646
ADD,DC %r3,%r19,%r19 ; Cycle 6
647
ADD %r19,%r28,%r28 ; Cycle 7
648
ADD,DC %r0,%r0,%r22 ; Cycle 8
649
CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit
650
STD %r28,UN_EIGHT(%r24)
652
; Final carry propagation
656
LDD UN_EIGHT(%r24),%r26
658
CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry.
659
STD %r26,UN_EIGHT(%r24)
664
; Here is the code that handles the difficult case N=1.
665
; We do the usual trick -- branch out of the startup code at appropriate
666
; points, and branch into the shutdown code.
675
; We came out of the unrolled loop with wrong parity. Do one more
676
; single cycle. This is the "alternate body". It will, of course,
677
; give us opposite registers from the other case, so we need
678
; completely different shutdown code.
681
FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body)
683
FLDD 0(%r25),%fr7 ; Cycle 2
684
SHRPD %r0,%r31,31,%r4
686
ADD,DC %r0,%r20,%r20 ; Cycle 3
689
LDD -64(%sp),%r29 ; Cycle 4
691
XMPYU %fr7R,%fr7R,%fr29
692
LDD EIGHT(%r24),%r1 ; Cycle 5
693
XMPYU %fr7L,%fr7R,%fr27
694
XMPYU %fr7L,%fr7L,%fr30
695
LDD -104(%sp),%r19 ; Cycle 6
698
FSTD %fr27,-72(%sp) ; Cycle 7
700
ADD,DC %r0,%r21,%r21 ; Cycle 8
703
; Shutdown code, first stage.
705
SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body)
706
LDO THIRTY_TWO(%r24),%r24
708
LDD UN_SIXTEEN(%r24),%r1
709
SHRPD %r0,%r29,31,%r3 ; Cycle 2
711
ADD,DC %r0,%r19,%r19 ; Cycle 3
714
LDD -72(%sp),%r31 ; Cycle 4
715
STD %r4,UN_SIXTEEN(%r24)
716
LDD UN_EIGHT(%r24),%r28 ; Cycle 5
717
LDD -96(%sp),%r20 ; Cycle 6
719
ADD %r19,%r28,%r28 ; Cycle 7
720
ADD,DC %r0,%r22,%r22 ; Cycle 8
721
STD %r28,UN_EIGHT(%r24)
723
; Shutdown code, second stage.
726
SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body)
728
SHRPD %r0,%r31,31,%r4 ; Cycle 2
730
ADD,DC %r0,%r20,%r20 ; Cycle 3
732
STD %r3,0(%r24) ; Cycle 4
733
LDD EIGHT(%r24),%r1 ; Cycle 5
735
ADD %r20,%r1,%r1 ; Cycle 7
736
ADD,DC %r0,%r0,%r21 ; Cycle 8
737
CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit
740
; Final carry propagation
746
CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry.
756
#ifdef LITTLE_WORDIAN
757
.EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
758
.EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
760
.EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
761
.EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
766
; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
768
; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
769
; performs a 64-bit x any-size multiply, and adds the
770
; result to an area of memory. That is, it performs
778
; and then adds the "PQRST" vector into an area of memory,
779
; handling all carries.
781
; Digression on nomenclature and endian-ness:
783
; Each of the capital letters in the above represents a 64-bit
784
; quantity. That is, you could think of the discussion as
785
; being in terms of radix-16-quintillion arithmetic. The data
786
; type being manipulated is "unsigned long long int". This
787
; requires the 64-bit extension of the HP-UX C compiler,
788
; available at release 10. You need these compiler flags to
789
; enable these extensions:
791
; -Aa +e +DA2.0 +DS2.0
793
; (The first specifies ANSI C, the second enables the
794
; extensions, which are beyond ANSI C, and the third and
795
; fourth tell the compiler to use whatever features of the
796
; PA2.0 architecture it wishes, in order to made the code more
797
; efficient. Since the presence of the assembly code will
798
; make the program unable to run on anything less than PA2.0,
799
; you might as well gain the performance enhancements in the C
802
; Questions of "endian-ness" often come up, usually in the
803
; context of byte ordering in a word. These routines have a
804
; similar issue, that could be called "wordian-ness".
805
; Independent of byte ordering (PA is always big-endian), one
806
; can make two choices when representing extremely large
807
; numbers as arrays of 64-bit doublewords in memory.
809
; "Little-wordian" layout means that the least significant
810
; word of a number is stored at the lowest address.
819
; | | |____ address 0
821
; | |_______address 8
825
; "Big-wordian" means that the most significant word is at the
835
; | | |____ address 32
837
; | |_______address 24
841
; When you compile the file, you must specify one or the other, with
842
; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
844
; Incidentally, you assemble this file as part of your
845
; project with the same C compiler as the rest of the program.
846
; My "makefile" for a superprecision arithmetic package has
847
; the following stuff:
850
; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
852
; LDFLAGS = -L /usr/lib -Wl,-aarchive
854
; # general build rule for ".s" files:
856
; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
858
; # Now any bind step that calls for pa20.o will assemble pa20.s
860
; End of digression, back to arithmetic:
862
; The way we multiply two huge numbers is, of course, to multiply
863
; the "ABCD" vector by each of the "WXYZ" doublewords, adding
864
; the result vectors with increasing offsets, the way we learned
865
; in school, back before we all used calculators:
877
; So we call maxpy_PA20_big (in my case; my package is
878
; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
879
; in turn as the "scalar", and giving the "ABCD" vector each
880
; time. We direct it to add its result into an area of memory
881
; that we have cleared at the start. We skew the exact
882
; location into that area with each call.
884
; The prototype for the function is
886
; extern void maxpy_PA20_big(
887
; int length, /* Number of doublewords in the multiplicand vector. */
888
; const long long int *scalaraddr, /* Address to fetch the scalar. */
889
; const long long int *multiplicand, /* The multiplicand vector. */
890
; long long int *result); /* Where to accumulate the result. */
892
; (You should place a copy of this prototype in an include file
893
; or in your C file.)
895
; Now, IN ALL CASES, the given address for the multiplicand or
896
; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
897
; That word is, of course, the word at which the routine
898
; starts processing. "maxpy_PA20_little" then increases the
899
; addresses as it computes. "maxpy_PA20_big" decreases them.
901
; In our example above, "length" would be 4 in each case.
902
; "multiplicand" would be the "ABCD" vector. Specifically,
903
; the address of the element "D". "scalaraddr" would be the
904
; address of "W", "X", "Y", or "Z" on the four calls that we
905
; would make. (The order doesn't matter, of course.)
906
; "result" would be the appropriate address in the result
907
; area. When multiplying by "Z", that would be the least
908
; significant word. When multiplying by "Y", it would be the
909
; next higher word (8 bytes higher if little-wordian; 8 bytes
910
; lower if big-wordian), and so on. The size of the result
911
; area must be the the sum of the sizes of the multiplicand
912
; and multiplier vectors, and must be initialized to zero
915
; Whenever the routine adds its partial product into the result
916
; vector, it follows carry chains as far as they need to go.
918
; Here is the super-precision multiply routine that I use for
919
; my package. The package is big-wordian. I have taken out
920
; handling of exponents (it's a floating point package):
922
; static void mul_PA20(
924
; const long long int *arg1,
925
; const long long int *arg2,
926
; long long int *result)
930
; for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
932
; for (i=0 ; i<size ; i++) {
933
; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);