3
.ident "ia64.S, Version 2.0"
4
.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
7
// ====================================================================
8
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
11
// Rights for redistribution and usage in source and binary forms are
12
// granted according to the OpenSSL license. Warranty of any kind is
14
// ====================================================================
16
// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
17
// different from Itanium to this module viewpoint. Most notably, is it
18
// "wider" than Itanium? Can you experience loop scalability as
19
// discussed in commentary sections? Not really:-( Itanium2 has 6
20
// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
21
// spin twice as fast, as I need 8 IALU ports. Amount of floating point
22
// ports is the same, i.e. 2, while I need 4. In other words, to this
23
// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
24
// essentially different in respect to this module, and a re-tune was
25
// required. Well, because some instruction latencies have changed. Most
26
// noticeably those intensively used:
32
// xma[->getf] 7[+1] 4[+0]
33
// add[->st8] 1[+1] 1[+0]
35
// What does it mean? You might ratiocinate that the original code
36
// should run just faster... Because sum of latencies is smaller...
37
// Wrong! Note that getf latency increased. This means that if a loop is
38
// scheduled for lower latency (and they are), then it will suffer from
39
// stall condition and the code will therefore turn anti-scalable, e.g.
40
// original bn_mul_words spun at 5*n or 2.5 times slower than expected
41
// on Itanium2! What to do? Reschedule loops for Itanium2? But then
42
// Itanium would exhibit anti-scalability. So I've chosen to reschedule
43
// for worst latency for every instruction aiming for best *all-round*
46
// Q. How much faster does it get?
47
// A. Here is the output from 'openssl speed rsa dsa' for vanilla
48
// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
49
// Linux 7.1 2.96-81):
51
// sign verify sign/s verify/s
52
// rsa 512 bits 0.0036s 0.0003s 275.3 2999.2
53
// rsa 1024 bits 0.0203s 0.0011s 49.3 894.1
54
// rsa 2048 bits 0.1331s 0.0040s 7.5 250.9
55
// rsa 4096 bits 0.9270s 0.0147s 1.1 68.1
56
// sign verify sign/s verify/s
57
// dsa 512 bits 0.0035s 0.0043s 288.3 234.8
58
// dsa 1024 bits 0.0111s 0.0135s 90.0 74.2
60
// And here is similar output but for this assembler
63
// sign verify sign/s verify/s
64
// rsa 512 bits 0.0021s 0.0001s 549.4 9638.5
65
// rsa 1024 bits 0.0055s 0.0002s 183.8 4481.1
66
// rsa 2048 bits 0.0244s 0.0006s 41.4 1726.3
67
// rsa 4096 bits 0.1295s 0.0018s 7.7 561.5
68
// sign verify sign/s verify/s
69
// dsa 512 bits 0.0012s 0.0013s 891.9 756.6
70
// dsa 1024 bits 0.0023s 0.0028s 440.4 376.2
72
// Yes, you may argue that it's not fair comparison as it's
73
// possible to craft the C implementation with BN_UMULT_HIGH
74
// inline assembler macro. But of course! Here is the output
77
// sign verify sign/s verify/s
78
// rsa 512 bits 0.0020s 0.0002s 495.0 6561.0
79
// rsa 1024 bits 0.0086s 0.0004s 116.2 2235.7
80
// rsa 2048 bits 0.0519s 0.0015s 19.3 667.3
81
// rsa 4096 bits 0.3464s 0.0053s 2.9 187.7
82
// sign verify sign/s verify/s
83
// dsa 512 bits 0.0016s 0.0020s 613.1 510.5
84
// dsa 1024 bits 0.0045s 0.0054s 221.0 183.9
86
// My code is still way faster, huh:-) And I believe that even
87
// higher performance can be achieved. Note that as keys get
88
// longer, performance gain is larger. Why? According to the
89
// profiler there is another player in the field, namely
90
// BN_from_montgomery consuming larger and larger portion of CPU
91
// time as keysize decreases. I therefore consider putting effort
92
// to assembler implementation of the following routine:
94
// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
99
// for (i=0; i<nl; i++)
101
// v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
104
// if (((nrp[-1]+=v)&BN_MASK2) < v)
105
// for (j=0; ((++nrp[j])&BN_MASK2) == 0; j++) ;
109
// It might as well be beneficial to implement even combaX
110
// variants, as it appears as it can literally unleash the
111
// performance (see comment section to bn_mul_comba8 below).
113
// And finally for your reference the output for 0.9.6a compiled
114
// with SGIcc version 0.01.0-12 (keep in mind that for the moment
115
// of this writing it's not possible to convince SGIcc to use
116
// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
117
// i.e. for a compiler generated one:-):
119
// sign verify sign/s verify/s
120
// rsa 512 bits 0.0022s 0.0002s 452.7 5894.3
121
// rsa 1024 bits 0.0097s 0.0005s 102.7 2002.9
122
// rsa 2048 bits 0.0578s 0.0017s 17.3 600.2
123
// rsa 4096 bits 0.3838s 0.0061s 2.6 164.5
124
// sign verify sign/s verify/s
125
// dsa 512 bits 0.0018s 0.0022s 547.3 459.6
126
// dsa 1024 bits 0.0051s 0.0062s 196.6 161.3
128
// Oh! Benchmarks were performed on 733MHz Lion-class Itanium
129
// system running Redhat Linux 7.1 (very special thanks to Ray
130
// McCaffity of Williams Communications for providing an account).
132
// Q. What's the heck with 'rum 1<<5' at the end of every function?
133
// A. Well, by clearing the "upper FP registers written" bit of the
134
// User Mask I want to excuse the kernel from preserving upper
135
// (f32-f127) FP register bank over process context switch, thus
136
// minimizing bus bandwidth consumption during the switch (i.e.
137
// after PKI operation completes and the program is off doing
138
// something else like bulk symmetric encryption). Having said
139
// this, I also want to point out that it might be good idea
140
// to compile the whole toolkit (as well as majority of the
141
// programs for that matter) with -mfixed-range=f32-f127 command
142
// line option. No, it doesn't prevent the compiler from writing
143
// to upper bank, but at least discourages to do so. If you don't
144
// like the idea you have the option to compile the module with
145
// -Drum=nop.m in command line.
150
// bn_[add|sub]_words routines.
152
// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
153
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
154
// compress the epilogue and get down to 2*n+6, but at the cost of
155
// scalability (the neat feature of this implementation is that it
156
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
157
// I consider that the epilogue is short enough as it is to trade tiny
158
// performance loss on Itanium for scalability.
160
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
// Adds the word arrays ap[] and bp[] word by word, stores the sums
// through rp and propagates the carry from one word to the next.
// Arguments arrive in r32 (rp), r33 (ap), r34 (bp) and r35 (num). r8 is
// the return value: it is cleared on entry and incremented after the
// loop when the last carry predicate is set. The body is a
// software-pipelined loop driven by rotating predicates/registers and
// closed by br.ctop.
162
.global bn_add_words#
165
.skip 32 // makes the loop body aligned at 64-byte boundary
170
{ .mii; alloc r2=ar.pfs,4,12,0,16 // frame: 4 in, 12 local, 0 out, 16 rotating
171
cmp4.le p6,p0=r35,r0 };; // p6 = (num <= 0), 32-bit signed compare
172
{ .mfb; mov r8=r0 // return value, stays 0 unless a carry is left over
173
(p6) br.ret.spnt.many b0 };; // nothing to do when num <= 0
176
{ .mib; sub r10=r35,r0,1 // r10 = num-1 (its consumer is not visible in this listing)
178
brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
182
#if defined(_HPUX_SOURCE) && defined(_ILP32)
183
addp4 r14=0,r32 // rp, widened from a 32-bit pointer (HP-UX ILP32 only)
189
#if defined(_HPUX_SOURCE) && defined(_ILP32)
190
addp4 r15=0,r33 // ap, widened from a 32-bit pointer (HP-UX ILP32 only)
197
#if defined(_HPUX_SOURCE) && defined(_ILP32)
198
addp4 r16=0,r34 // bp, widened from a 32-bit pointer (HP-UX ILP32 only)
204
.L_bn_add_words_ctop:
205
{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
206
(p18) add r39=r37,r34 // sum=a+b, two rotations after the loads
207
(p19) cmp.ltu.unc p56,p0=r40,r38 } // p56 = (sum < a), i.e. the addition wrapped
208
{ .mfb; (p0) nop.m 0x0
211
{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
212
(p58) cmp.eq.or p57,p0=-1,r41 // (p20) carry-in onto an all-ones sum carries further
213
(p58) add r41=1,r41 } // (p20) apply the carry-in: sum+=1
214
{ .mfb; (p21) st8 [r14]=r42,8 // *(rp++)=r
216
br.ctop.sptk .L_bn_add_words_ctop };;
217
.L_bn_add_words_cend:
220
(p59) add r8=1,r8 // return value: 1 when the carry predicate p59 is set
224
br.ret.sptk.many b0 };;
228
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
// Subtracts the word array bp[] from ap[] word by word, stores the
// differences through rp and propagates the borrow from one word to the
// next. Arguments arrive in r32 (rp), r33 (ap), r34 (bp) and r35 (num).
// r8 is the return value: it is cleared on entry and incremented after
// the loop when the last borrow predicate is set. The loop has the same
// shape as the one in bn_add_words, with add/cmp.ltu replaced by
// sub/cmp.gtu.
230
.global bn_sub_words#
233
.skip 32 // makes the loop body aligned at 64-byte boundary
238
{ .mii; alloc r2=ar.pfs,4,12,0,16 // frame: 4 in, 12 local, 0 out, 16 rotating
239
cmp4.le p6,p0=r35,r0 };; // p6 = (num <= 0), 32-bit signed compare
240
{ .mfb; mov r8=r0 // return value, stays 0 unless a borrow is left over
241
(p6) br.ret.spnt.many b0 };; // nothing to do when num <= 0
244
{ .mib; sub r10=r35,r0,1 // r10 = num-1 (its consumer is not visible in this listing)
246
brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
250
#if defined(_HPUX_SOURCE) && defined(_ILP32)
251
addp4 r14=0,r32 // rp, widened from a 32-bit pointer (HP-UX ILP32 only)
257
#if defined(_HPUX_SOURCE) && defined(_ILP32)
258
addp4 r15=0,r33 // ap, widened from a 32-bit pointer (HP-UX ILP32 only)
265
#if defined(_HPUX_SOURCE) && defined(_ILP32)
266
addp4 r16=0,r34 // bp, widened from a 32-bit pointer (HP-UX ILP32 only)
272
.L_bn_sub_words_ctop:
273
{ .mii; (p16) ld8 r32=[r16],8 // b=*(bp++)
274
(p18) sub r39=r37,r34 // diff=a-b, two rotations after the loads
275
(p19) cmp.gtu.unc p56,p0=r40,r38 } // p56 = (diff > a), i.e. the subtraction wrapped
276
{ .mfb; (p0) nop.m 0x0
279
{ .mii; (p16) ld8 r35=[r15],8 // a=*(ap++)
280
(p58) cmp.eq.or p57,p0=0,r41 // (p20) borrow-in from a zero difference borrows further
281
(p58) add r41=-1,r41 } // (p20) apply the borrow-in: diff-=1
282
{ .mbb; (p21) st8 [r14]=r42,8 // *(rp++)=r
284
br.ctop.sptk .L_bn_sub_words_ctop };;
285
.L_bn_sub_words_cend:
288
(p59) add r8=1,r8 // return value: 1 when the borrow predicate p59 is set
292
br.ret.sptk.many b0 };;
297
// XMA_TEMPTATION is tested by the #ifdef/#ifndef directives inside
// bn_mul_words: when it is defined, the xma-based loop in the
// "#else // XMA_TEMPTATION" branch is assembled instead of the
// xmpy/getf/integer-add loop.
// NOTE(review): as listed here the macro is defined unconditionally,
// while the commentary in the #else branch describes that loop as
// "commented out" -- confirm whether a guard around this #define is
// missing.
#define XMA_TEMPTATION
302
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// Multiplies each word loaded through ap by w (r35, copied to f8),
// stores a result word per iteration through rp and leaves the final
// carry word in r8. r34 holds num.
304
.global bn_mul_words#
307
.skip 32 // makes the loop body aligned at 64-byte boundary
312
#ifdef XMA_TEMPTATION
313
{ .mfi; alloc r2=ar.pfs,4,0,0,0 };; // xma variant: 4 inputs, no locals, no rotating GRs
315
{ .mfi; alloc r2=ar.pfs,4,12,0,16 };; // 4 in, 12 local, 16 rotating GRs
317
{ .mib; mov r8=r0 // return value
319
(p6) br.ret.spnt.many b0 };; // early exit; the compare that sets p6 is not visible in this listing
322
{ .mii; sub r10=r34,r0,1 // r10 = num-1 (its consumer is not visible in this listing)
327
{ .mib; setf.sig f8=r35 // w
328
mov pr.rot=0x800001<<16 // sets rotating predicates p16 and p39
329
// ------^----- serves as (p50) at first (p27)
330
brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
333
#ifndef XMA_TEMPTATION
336
#if defined(_HPUX_SOURCE) && defined(_ILP32)
337
addp4 r14=0,r32 // rp, widened from a 32-bit pointer (HP-UX ILP32 only)
338
addp4 r15=0,r33 // ap, widened from a 32-bit pointer (HP-UX ILP32 only)
344
{ .mii; mov r40=0 // serves as r35 at first (p27)
347
// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
348
// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
349
// bypass L1 cache and L2 latency is actually best-case scenario for
350
// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
351
// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
352
// would give us ~5% in *overall* performance improvement on "wider"
353
// IA-64, but would hurt Itanium for about same because of longer
354
// epilogue. As it's a matter of few percents in either case I've
355
// chosen to trade the scalability for development time (you can see
356
// this very instruction sequence in bn_mul_add_words loop which in
357
// turn is scalable).
358
.L_bn_mul_words_ctop:
359
{ .mfi; (p25) getf.sig r36=f52 // low
360
(p21) xmpy.lu f48=f37,f8 // low 64 bits of a*w
361
(p28) cmp.ltu p54,p50=r41,r39 } // p54 = (r41 < r39), p50 = its complement
362
{ .mfi; (p16) ldf8 f32=[r15],8 // a=*(ap++), loaded straight into an FP register
363
(p21) xmpy.hu f40=f37,f8 // high 64 bits of a*w
365
{ .mii; (p25) getf.sig r32=f44 // high
366
.pred.rel "mutex",p50,p54
367
(p50) add r40=r38,r35 // (p27)
368
(p54) add r40=r38,r35,1 } // (p27) same sum plus 1 when p54 (carry) is set
369
{ .mfb; (p28) st8 [r14]=r41,8 // *(rp++)=result word
371
br.ctop.sptk .L_bn_mul_words_ctop };;
372
.L_bn_mul_words_cend:
375
.pred.rel "mutex",p51,p55
377
(p55) add r8=r36,r0,1 } // return value = r36+1 when p55 is set
382
#else // XMA_TEMPTATION
384
setf.sig f37=r0 // serves as carry at (p18) tick
388
// Most of you examining this code very likely wonder why in the name
389
// of Intel the following loop is commented out? Indeed, it looks so
390
// neat that you find it hard to believe that it's something wrong
391
// with it, right? The catch is that every iteration depends on the
392
// result from previous one and the latter isn't available instantly.
393
// The loop therefore spins at the latency of xma minus 1, or in other
394
// words at 6*(n+4) ticks:-( Compare to the "production" loop above
395
// that runs in 2*(n+11) where the low latency problem is worked around
396
// by moving the dependency to one-tick latent integer ALU. Note that
397
// "distance" between ldf8 and xma is not latency of ldf8, but the
398
// *difference* between xma and ldf8 latencies.
399
.L_bn_mul_words_ctop:
400
{ .mfi; (p16) ldf8 f32=[r33],8 // a=*(ap++)
401
(p18) xma.hu f38=f34,f8,f39 } // high 64 bits of a*w+carry
402
{ .mfb; (p20) stf8 [r32]=f37,8 // *(rp++)=result word
403
(p18) xma.lu f35=f34,f8,f39 // low 64 bits of a*w+carry
404
br.ctop.sptk .L_bn_mul_words_ctop };;
405
.L_bn_mul_words_cend:
407
getf.sig r8=f41 // the return value
409
#endif // XMA_TEMPTATION
414
{ .mfb; rum 1<<5 // clear um.mfh
416
br.ret.sptk.many b0 };;
422
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
// Multiplies each word loaded through ap by w (r35, copied to f8), adds
// the existing word read back through a second copy of rp (r18) and
// stores the sum through rp (r14). r8 is cleared on entry and receives
// the final carry word after the loop. r34 holds num.
424
.global bn_mul_add_words#
425
.proc bn_mul_add_words#
427
//.skip 0 // makes the loop split at 64-byte boundary
432
{ .mii; alloc r2=ar.pfs,4,12,0,16 // frame: 4 in, 12 local, 0 out, 16 rotating
433
cmp4.le p6,p0=r34,r0 };; // p6 = (num <= 0), 32-bit signed compare
434
{ .mfb; mov r8=r0 // return value
435
(p6) br.ret.spnt.many b0 };; // nothing to do when num <= 0
438
{ .mii; sub r10=r34,r0,1 // r10 = num-1 (its consumer is not visible in this listing)
443
{ .mib; setf.sig f8=r35 // w
444
mov pr.rot=0x800001<<16 // sets rotating predicates p16 and p39
445
// ------^----- serves as (p50) at first (p27)
446
brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
449
#if defined(_HPUX_SOURCE) && defined(_ILP32)
450
addp4 r14=0,r32 // rp, widened from a 32-bit pointer (HP-UX ILP32 only)
451
addp4 r15=0,r33 // ap, widened from a 32-bit pointer (HP-UX ILP32 only)
457
{ .mii; mov r40=0 // serves as r35 at first (p27)
458
#if defined(_HPUX_SOURCE) && defined(_ILP32)
459
addp4 r18=0,r32 // rp copy
461
mov r18=r32 // rp copy
465
// This loop spins in 3*(n+14) ticks on Itanium and should spin in
466
// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
467
// µ-architecture manuals as they become available). As usual it's
468
// possible to compress the epilogue, down to 10 in this case, at the
469
// cost of scalability. Compressed (and therefore non-scalable) loop
470
// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
471
// from "wider" IA-64 so let it be scalable! Special attention was
472
// paid for having the loop body split at 64-byte boundary. ld8 is
473
// scheduled for L1 cache as the data is more than likely there.
474
// Indeed, bn_mul_words has put it there a moment ago:-)
475
.L_bn_mul_add_words_ctop:
476
{ .mfi; (p25) getf.sig r36=f52 // low
477
(p21) xmpy.lu f48=f37,f8 // low 64 bits of a*w
478
(p28) cmp.ltu p54,p50=r41,r39 } // p54 = (r41 < r39), p50 = its complement
479
{ .mfi; (p16) ldf8 f32=[r15],8 // a=*(ap++), loaded straight into an FP register
480
(p21) xmpy.hu f40=f37,f8 // high 64 bits of a*w
481
(p28) add r45=r45,r41 };; // add the product word to the word loaded through r18
482
{ .mii; (p25) getf.sig r32=f44 // high
483
.pred.rel "mutex",p50,p54
484
(p50) add r40=r38,r35 // (p27)
485
(p54) add r40=r38,r35,1 } // (p27) same sum plus 1 when p54 (carry) is set
486
{ .mfb; (p28) cmp.ltu.unc p60,p0=r45,r41 // p60 = (sum < product word), i.e. the add above wrapped
489
{ .mii; (p27) ld8 r44=[r18],8 // existing result word, read through the rp copy
490
(p62) cmp.eq.or p61,p0=-1,r46 // carry-in onto an all-ones word carries further
491
(p62) add r46=1,r46 } // apply the carry-in
492
{ .mfb; (p30) st8 [r14]=r47,8 // *(rp++)=updated word
494
br.ctop.sptk .L_bn_mul_add_words_ctop};;
495
.L_bn_mul_add_words_cend:
498
.pred.rel "mutex",p53,p57
500
(p57) add r8=r38,r0,1 } // return value = r38+1 when p57 is set
508
{ .mfb; rum 1<<5 // clear um.mfh
510
br.ret.sptk.many b0 };;
511
.endp bn_mul_add_words#
516
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
// Squares each word loaded through ap (r33) and stores the 128-bit
// result as two 64-bit words: the low half through r32 (rp) and the
// high half through r34 (rp+8), both pointers advancing by 16 per
// iteration. r34 holds num on entry and is reused as the second store
// pointer once the count has been taken from it.
518
.global bn_sqr_words#
521
.skip 32 // makes the loop body aligned at 64-byte boundary
526
{ .mii; alloc r2=ar.pfs,3,0,0,0 // frame: 3 inputs, no locals, no rotating GRs
528
{ .mii; cmp.le p6,p0=r34,r0 // p6 = (num <= 0)
529
mov r8=r0 } // return value
531
(p6) br.ret.spnt.many b0 };; // nothing to do when num <= 0
534
{ .mii; sub r10=r34,r0,1 // r10 = num-1 (its consumer is not visible in this listing)
539
#if defined(_HPUX_SOURCE) && defined(_ILP32)
540
{ .mii; addp4 r32=0,r32 // rp, widened from a 32-bit pointer (HP-UX ILP32 only)
545
brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
547
{ .mii; add r34=8,r32 // r34 = rp+8, store pointer for the high halves
551
// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
552
// possible to compress the epilogue (I'm getting tired to write this
553
// comment over and over) and get down to 2*n+16 at the cost of
554
// scalability. The decision will very likely be reconsidered after the
555
// benchmark program is profiled. I.e. if performance gain on Itanium
556
// will appear larger than loss on "wider" IA-64, then the loop should
557
// be explicitly split and the epilogue compressed.
558
.L_bn_sqr_words_ctop:
559
{ .mfi; (p16) ldf8 f32=[r33],8 // a=*(ap++)
560
(p25) xmpy.lu f42=f41,f41 // low 64 bits of a*a
562
{ .mib; (p33) stf8 [r32]=f50,16 // store the low half, advance by two words
565
{ .mfi; (p0) nop.m 0x0
566
(p25) xmpy.hu f52=f41,f41 // high 64 bits of a*a
568
{ .mib; (p33) stf8 [r34]=f60,16 // store the high half, advance by two words
570
br.ctop.sptk .L_bn_sqr_words_ctop };;
571
.L_bn_sqr_words_cend:
576
{ .mfb; rum 1<<5 // clear um.mfh
578
br.ret.sptk.many b0 };;
583
// Apparently we win nothing by implementing special bn_sqr_comba8.
584
// Yes, it is possible to reduce the number of multiplications by
585
// almost factor of two, but then the amount of additions would
586
// increase by factor of two (as we would have to perform those
587
// otherwise performed by xma ourselves). Normally we would trade
588
// anyway as multiplications are way more expensive, but not this
589
// time... Multiplication kernel is fully pipelined and as we drain
590
// one 128-bit multiplication result per clock cycle multiplications
591
// are effectively as inexpensive as additions. Special implementation
592
// might become of interest for "wider" IA-64 implementation as you'll
593
// be able to get through the multiplication phase faster (there won't
594
// be any stall issues as discussed in the commentary section below and
595
// you therefore will be able to employ all 4 FP units)... But these
596
// Itanium days it's simply too hard to justify the effort so I just
597
// drop down to bn_mul_comba8 code:-)
599
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
// Thin entry stub: it allocates a frame with two inputs plus one local
// (r34, the register bn_mul_comba8 uses for its third argument), derives
// load pointers from r33 and r34 and branches to .L_cheat_entry_point8.
// NOTE(review): the instruction that initialises r34 is not visible in
// this listing; presumably it is set to the same operand as r33 (a) --
// confirm against the complete file.
601
.global bn_sqr_comba8#
608
#if defined(_HPUX_SOURCE) && defined(_ILP32)
609
{ .mii; alloc r2=ar.pfs,2,1,0,0 // 2 inputs + 1 local (r34)
614
{ .mii; alloc r2=ar.pfs,2,1,0,0 // 2 inputs + 1 local (r34)
619
{ .mii; add r17=8,r34 // r17 = r34+8
622
{ .mfb; add r16=24,r33 // r16 = a+24
623
br .L_cheat_entry_point8 };; // continue with the shared multiplication code
628
// I've estimated this routine to run in ~120 ticks, but in reality
629
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
630
// cycles consumed for instructions fetch? Or did I misinterpret some
631
// clause in Itanium µ-architecture manual? Comments are welcomed and
632
// highly appreciated.
634
// However! It should be noted that even 160 ticks is darn good result
635
// as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
636
// C version (compiled with gcc with inline assembler). I really
637
// kicked compiler's butt here, didn't I? Yeah! This brings us to the
638
// following statement. It's damn shame that this routine isn't called
639
// very often nowadays! According to the profiler most CPU time is
640
// consumed by bn_mul_add_words called from BN_from_montgomery. In
641
// order to estimate what we're missing, I've compared the performance
642
// of this routine against "traditional" implementation, i.e. against
643
// following routine:
645
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
646
// { r[ 8]=bn_mul_words( &(r[0]),a,8,b[0]);
647
// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
648
// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
649
// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
650
// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
651
// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
652
// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
653
// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
656
// The one below is over 8 times faster than the one above:-( Even
657
// more reasons to "combafy" bn_mul_add_mont...
659
// And yes, this routine really made me wish there were an optimizing
660
// assembler! It also feels like it deserves a dedication.
662
// To my wife for being there and to my kids...
664
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
// Straight-line (loop-free) multiplication. Arguments arrive in r32 (r),
// r33 (a) and r34 (b). Words read through the a-side pointers
// (r33/r14/r16) go to f32..f39 and words read through the b-side
// pointers (r34/r18/r19) go to f120..f127; only part of the loads and
// pointer set-up is visible in this listing. The first group of xma
// instructions below multiplies f32 by each of f120..f127 with a zero
// addend (f0), leaving the low halves in f40,f50,...,f110 and the high
// halves in f41,f51,...,f111.
// bn_sqr_comba8 enters at .L_cheat_entry_point8.
669
.global bn_mul_comba8#
676
#if defined(_HPUX_SOURCE) && defined(_ILP32)
677
{ .mii; alloc r2=ar.pfs,3,0,0,0 // 3 inputs, no locals
680
{ .mii; addp4 r32=0,r32 // r, widened from a 32-bit pointer (HP-UX ILP32 only)
682
{ .mii; alloc r2=ar.pfs,3,0,0,0 // 3 inputs, no locals
687
{ .mii; add r15=16,r33 // r15 = a+16
690
.L_cheat_entry_point8:
691
{ .mmi; add r19=24,r34 // r19 = b+24
693
ldf8 f32=[r33],32 };; // f32 = a[0], then r33 += 32
695
{ .mmi; ldf8 f120=[r34],32 // f120 = b[0], then r34 += 32
697
{ .mmi; ldf8 f122=[r18],32
698
ldf8 f123=[r19],32 };; // f123 = b[3]
699
{ .mmi; ldf8 f124=[r34] // f124 = b[4]
701
{ .mmi; ldf8 f126=[r18]
704
{ .mmi; ldf8 f33=[r14],32
706
{ .mmi; ldf8 f35=[r16],32;;
708
{ .mmi; ldf8 f37=[r14]
710
{ .mfi; ldf8 f39=[r16]
711
// -------\ Entering multiplier's heaven /-------
712
// ------------\ /------------
713
// -----------------\ /-----------------
714
// ----------------------\/----------------------
715
xma.hu f41=f32,f120,f0 }
716
{ .mfi; xma.lu f40=f32,f120,f0 };; // (*)
717
{ .mfi; xma.hu f51=f32,f121,f0 }
718
{ .mfi; xma.lu f50=f32,f121,f0 };;
719
{ .mfi; xma.hu f61=f32,f122,f0 }
720
{ .mfi; xma.lu f60=f32,f122,f0 };;
721
{ .mfi; xma.hu f71=f32,f123,f0 }
722
{ .mfi; xma.lu f70=f32,f123,f0 };;
723
{ .mfi; xma.hu f81=f32,f124,f0 }
724
{ .mfi; xma.lu f80=f32,f124,f0 };;
725
{ .mfi; xma.hu f91=f32,f125,f0 }
726
{ .mfi; xma.lu f90=f32,f125,f0 };;
727
{ .mfi; xma.hu f101=f32,f126,f0 }
728
{ .mfi; xma.lu f100=f32,f126,f0 };;
729
{ .mfi; xma.hu f111=f32,f127,f0 }
730
{ .mfi; xma.lu f110=f32,f127,f0 };;//
731
// (*) You can argue that splitting at every second bundle would
732
// prevent "wider" IA-64 implementations from achieving the peak
733
// performance. Well, not really... The catch is that if you
734
// intend to keep 4 FP units busy by splitting at every fourth
735
// bundle and thus perform these 16 multiplications in 4 ticks,
736
// the first bundle *below* would stall because the result from
737
// the first xma bundle *above* won't be available for another 3
738
// ticks (if not more, being an optimist, I assume that "wider"
739
// implementation will have same latency:-). This stall will hold
740
// you back and the performance would be as if every second bundle
741
// were split *anyway*...
742
// bn_mul_comba8, multiplier f33 and then f34: each is multiplied by
// f120..f127. Every xma pair takes as addend the register in which the
// previous group left the high half for the same f12x operand, writes
// the new low half back into that register (xma.lu) and the new high
// half into the next-numbered one (xma.hu). The M/I slots meanwhile move
// finished low halves to integer registers (getf.sig), compare them for
// carries (cmp.ltu) and store result words through r32/r33 with a
// 16-byte stride.
// NOTE(review): carry1/carry2 are symbolic names whose definitions, like
// the initialisation of r33 as a store pointer, are not visible in this
// listing.
{ .mfi; getf.sig r16=f40
743
xma.hu f42=f33,f120,f41
745
{ .mfi; xma.lu f41=f33,f120,f41 };;
746
{ .mfi; getf.sig r24=f50
747
xma.hu f52=f33,f121,f51 }
748
{ .mfi; xma.lu f51=f33,f121,f51 };;
749
{ .mfi; st8 [r32]=r16,16
750
xma.hu f62=f33,f122,f61 }
751
{ .mfi; xma.lu f61=f33,f122,f61 };;
752
{ .mfi; xma.hu f72=f33,f123,f71 }
753
{ .mfi; xma.lu f71=f33,f123,f71 };;
754
{ .mfi; xma.hu f82=f33,f124,f81 }
755
{ .mfi; xma.lu f81=f33,f124,f81 };;
756
{ .mfi; xma.hu f92=f33,f125,f91 }
757
{ .mfi; xma.lu f91=f33,f125,f91 };;
758
{ .mfi; xma.hu f102=f33,f126,f101 }
759
{ .mfi; xma.lu f101=f33,f126,f101 };;
760
{ .mfi; xma.hu f112=f33,f127,f111 }
761
{ .mfi; xma.lu f111=f33,f127,f111 };;//
762
//-------------------------------------------------//
763
{ .mfi; getf.sig r25=f41
764
xma.hu f43=f34,f120,f42 }
765
{ .mfi; xma.lu f42=f34,f120,f42 };;
766
{ .mfi; getf.sig r16=f60
767
xma.hu f53=f34,f121,f52 }
768
{ .mfi; xma.lu f52=f34,f121,f52 };;
769
{ .mfi; getf.sig r17=f51
770
xma.hu f63=f34,f122,f62
772
{ .mfi; xma.lu f62=f34,f122,f62
774
{ .mfi; cmp.ltu p6,p0=r25,r24
775
xma.hu f73=f34,f123,f72 }
776
{ .mfi; xma.lu f72=f34,f123,f72 };;
777
{ .mfi; st8 [r33]=r25,16
778
xma.hu f83=f34,f124,f82
779
(p6) add carry1=1,carry1 }
780
{ .mfi; xma.lu f82=f34,f124,f82 };;
781
{ .mfi; xma.hu f93=f34,f125,f92 }
782
{ .mfi; xma.lu f92=f34,f125,f92 };;
783
{ .mfi; xma.hu f103=f34,f126,f102 }
784
{ .mfi; xma.lu f102=f34,f126,f102 };;
785
{ .mfi; xma.hu f113=f34,f127,f112 }
786
{ .mfi; xma.lu f112=f34,f127,f112 };;//
787
//-------------------------------------------------//
788
// bn_mul_comba8, multiplier f35 and then f36: each is multiplied by
// f120..f127, accumulating onto the high halves left by the preceding
// group in the same way as before. The interleaved integer instructions
// keep summing already-finished low halves: add forms a partial sum, the
// following cmp.ltu tests whether it wrapped, and the predicated
// "add carryN=1,carryN" counts that wrap for the next result word.
// carry1 and carry2 alternate between consecutive result words, which
// are stored alternately through r32 and r33.
{ .mfi; getf.sig r18=f42
789
xma.hu f44=f35,f120,f43
791
{ .mfi; xma.lu f43=f35,f120,f43 };;
792
{ .mfi; getf.sig r24=f70
793
xma.hu f54=f35,f121,f53 }
795
xma.lu f53=f35,f121,f53 };;
796
{ .mfi; getf.sig r25=f61
797
xma.hu f64=f35,f122,f63
798
cmp.ltu p7,p0=r17,r16 }
799
{ .mfi; add r18=r18,r17
800
xma.lu f63=f35,f122,f63 };;
801
{ .mfi; getf.sig r26=f52
802
xma.hu f74=f35,f123,f73
803
(p7) add carry2=1,carry2 }
804
{ .mfi; cmp.ltu p7,p0=r18,r17
805
xma.lu f73=f35,f123,f73
806
add r18=r18,carry1 };;
808
xma.hu f84=f35,f124,f83
809
(p7) add carry2=1,carry2 }
810
{ .mfi; cmp.ltu p7,p0=r18,carry1
811
xma.lu f83=f35,f124,f83 };;
812
{ .mfi; st8 [r32]=r18,16
813
xma.hu f94=f35,f125,f93
814
(p7) add carry2=1,carry2 }
815
{ .mfi; xma.lu f93=f35,f125,f93 };;
816
{ .mfi; xma.hu f104=f35,f126,f103 }
817
{ .mfi; xma.lu f103=f35,f126,f103 };;
818
{ .mfi; xma.hu f114=f35,f127,f113 }
820
xma.lu f113=f35,f127,f113
821
add r25=r25,r24 };;//
822
//-------------------------------------------------//
823
{ .mfi; getf.sig r27=f43
824
xma.hu f45=f36,f120,f44
825
cmp.ltu p6,p0=r25,r24 }
826
{ .mfi; xma.lu f44=f36,f120,f44
828
{ .mfi; getf.sig r16=f80
829
xma.hu f55=f36,f121,f54
830
(p6) add carry1=1,carry1 }
831
{ .mfi; xma.lu f54=f36,f121,f54 };;
832
{ .mfi; getf.sig r17=f71
833
xma.hu f65=f36,f122,f64
834
cmp.ltu p6,p0=r26,r25 }
835
{ .mfi; xma.lu f64=f36,f122,f64
837
{ .mfi; getf.sig r18=f62
838
xma.hu f75=f36,f123,f74
839
(p6) add carry1=1,carry1 }
840
{ .mfi; cmp.ltu p6,p0=r27,r26
841
xma.lu f74=f36,f123,f74
842
add r27=r27,carry2 };;
843
{ .mfi; getf.sig r19=f53
844
xma.hu f85=f36,f124,f84
845
(p6) add carry1=1,carry1 }
846
{ .mfi; xma.lu f84=f36,f124,f84
847
cmp.ltu p6,p0=r27,carry2 };;
848
{ .mfi; st8 [r33]=r27,16
849
xma.hu f95=f36,f125,f94
850
(p6) add carry1=1,carry1 }
851
{ .mfi; xma.lu f94=f36,f125,f94 };;
852
{ .mfi; xma.hu f105=f36,f126,f104 }
854
xma.lu f104=f36,f126,f104
856
{ .mfi; xma.hu f115=f36,f127,f114
857
cmp.ltu p7,p0=r17,r16 }
858
{ .mfi; xma.lu f114=f36,f127,f114
859
add r18=r18,r17 };;//
860
//-------------------------------------------------//
861
// bn_mul_comba8, multiplier f37 and then f38: each is multiplied by
// f120..f127, accumulating onto the high halves left by the preceding
// group. The integer side follows the same pattern as in the earlier
// groups (getf.sig to fetch a low half, add/cmp.ltu to sum and detect a
// wrap, predicated increment of carry1 or carry2), with one more term
// per result word than in the group before; one result word is stored
// through r32 and one through r33.
{ .mfi; getf.sig r20=f44
862
xma.hu f46=f37,f120,f45
863
(p7) add carry2=1,carry2 }
864
{ .mfi; cmp.ltu p7,p0=r18,r17
865
xma.lu f45=f37,f120,f45
867
{ .mfi; getf.sig r24=f90
868
xma.hu f56=f37,f121,f55 }
869
{ .mfi; xma.lu f55=f37,f121,f55 };;
870
{ .mfi; getf.sig r25=f81
871
xma.hu f66=f37,f122,f65
872
(p7) add carry2=1,carry2 }
873
{ .mfi; cmp.ltu p7,p0=r19,r18
874
xma.lu f65=f37,f122,f65
876
{ .mfi; getf.sig r26=f72
877
xma.hu f76=f37,f123,f75
878
(p7) add carry2=1,carry2 }
879
{ .mfi; cmp.ltu p7,p0=r20,r19
880
xma.lu f75=f37,f123,f75
881
add r20=r20,carry1 };;
882
{ .mfi; getf.sig r27=f63
883
xma.hu f86=f37,f124,f85
884
(p7) add carry2=1,carry2 }
885
{ .mfi; xma.lu f85=f37,f124,f85
886
cmp.ltu p7,p0=r20,carry1 };;
887
{ .mfi; getf.sig r28=f54
888
xma.hu f96=f37,f125,f95
889
(p7) add carry2=1,carry2 }
890
{ .mfi; st8 [r32]=r20,16
891
xma.lu f95=f37,f125,f95 };;
892
{ .mfi; xma.hu f106=f37,f126,f105 }
894
xma.lu f105=f37,f126,f105
896
{ .mfi; xma.hu f116=f37,f127,f115
897
cmp.ltu p6,p0=r25,r24 }
898
{ .mfi; xma.lu f115=f37,f127,f115
899
add r26=r26,r25 };;//
900
//-------------------------------------------------//
901
{ .mfi; getf.sig r29=f45
902
xma.hu f47=f38,f120,f46
903
(p6) add carry1=1,carry1 }
904
{ .mfi; cmp.ltu p6,p0=r26,r25
905
xma.lu f46=f38,f120,f46
907
{ .mfi; getf.sig r16=f100
908
xma.hu f57=f38,f121,f56
909
(p6) add carry1=1,carry1 }
910
{ .mfi; cmp.ltu p6,p0=r27,r26
911
xma.lu f56=f38,f121,f56
913
{ .mfi; getf.sig r17=f91
914
xma.hu f67=f38,f122,f66
915
(p6) add carry1=1,carry1 }
916
{ .mfi; cmp.ltu p6,p0=r28,r27
917
xma.lu f66=f38,f122,f66
919
{ .mfi; getf.sig r18=f82
920
xma.hu f77=f38,f123,f76
921
(p6) add carry1=1,carry1 }
922
{ .mfi; cmp.ltu p6,p0=r29,r28
923
xma.lu f76=f38,f123,f76
924
add r29=r29,carry2 };;
925
{ .mfi; getf.sig r19=f73
926
xma.hu f87=f38,f124,f86
927
(p6) add carry1=1,carry1 }
928
{ .mfi; xma.lu f86=f38,f124,f86
929
cmp.ltu p6,p0=r29,carry2 };;
930
{ .mfi; getf.sig r20=f64
931
xma.hu f97=f38,f125,f96
932
(p6) add carry1=1,carry1 }
933
{ .mfi; st8 [r33]=r29,16
934
xma.lu f96=f38,f125,f96 };;
935
{ .mfi; getf.sig r21=f55
936
xma.hu f107=f38,f126,f106 }
938
xma.lu f106=f38,f126,f106
940
{ .mfi; xma.hu f117=f38,f127,f116
941
cmp.ltu p7,p0=r17,r16 }
942
{ .mfi; xma.lu f116=f38,f127,f116
943
add r18=r18,r17 };;//
944
//-------------------------------------------------//
945
// bn_mul_comba8, last multiplier f39: it is multiplied by f120..f127,
// accumulating onto the high halves left by the preceding group, so that
// f118 receives the topmost high half. The integer side sums the low
// halves fetched so far into r22, counting wraps in carry2, and stores
// that result word through r32.
{ .mfi; getf.sig r22=f46
946
xma.hu f48=f39,f120,f47
947
(p7) add carry2=1,carry2 }
948
{ .mfi; cmp.ltu p7,p0=r18,r17
949
xma.lu f47=f39,f120,f47
951
{ .mfi; getf.sig r24=f110
952
xma.hu f58=f39,f121,f57
953
(p7) add carry2=1,carry2 }
954
{ .mfi; cmp.ltu p7,p0=r19,r18
955
xma.lu f57=f39,f121,f57
957
{ .mfi; getf.sig r25=f101
958
xma.hu f68=f39,f122,f67
959
(p7) add carry2=1,carry2 }
960
{ .mfi; cmp.ltu p7,p0=r20,r19
961
xma.lu f67=f39,f122,f67
963
{ .mfi; getf.sig r26=f92
964
xma.hu f78=f39,f123,f77
965
(p7) add carry2=1,carry2 }
966
{ .mfi; cmp.ltu p7,p0=r21,r20
967
xma.lu f77=f39,f123,f77
969
{ .mfi; getf.sig r27=f83
970
xma.hu f88=f39,f124,f87
971
(p7) add carry2=1,carry2 }
972
{ .mfi; cmp.ltu p7,p0=r22,r21
973
xma.lu f87=f39,f124,f87
974
add r22=r22,carry1 };;
975
{ .mfi; getf.sig r28=f74
976
xma.hu f98=f39,f125,f97
977
(p7) add carry2=1,carry2 }
978
{ .mfi; xma.lu f97=f39,f125,f97
979
cmp.ltu p7,p0=r22,carry1 };;
980
{ .mfi; getf.sig r29=f65
981
xma.hu f108=f39,f126,f107
982
(p7) add carry2=1,carry2 }
983
{ .mfi; st8 [r32]=r22,16
984
xma.lu f107=f39,f126,f107 };;
985
{ .mfi; getf.sig r30=f56
986
xma.hu f118=f39,f127,f117 }
987
{ .mfi; xma.lu f117=f39,f127,f117 };;//
988
//-------------------------------------------------//
989
// Leaving multiplier's heaven... Quite a ride, huh?
//
// bn_mul_comba8, integer tail: no further xma is issued. The remaining
// partial products are moved from FP to integer registers with getf.sig
// and summed into the remaining result words. After each add, the
// following cmp.ltu tests whether the sum wrapped, and a predicated
// increment of carry1/carry2/carry3 records that wrap for the next word.
// Result words are stored alternately through r32 and r33; the last word
// is the topmost high half (f118) plus the final carry count. The
// routine ends by clearing um.mfh and returning.
// NOTE(review): carry1/carry2/carry3 are symbolic names defined outside
// the visible text, and several add/bundle lines of this tail are
// missing from this listing.
991
{ .mii; getf.sig r31=f47
994
{ .mii; getf.sig r16=f111
995
cmp.ltu p6,p0=r25,r24
997
{ .mfb; getf.sig r17=f102 }
999
(p6) add carry1=1,carry1
1000
cmp.ltu p6,p0=r26,r25
1004
(p6) add carry1=1,carry1
1005
cmp.ltu p6,p0=r27,r26
1007
{ .mii; getf.sig r18=f93
1011
(p6) add carry1=1,carry1
1012
cmp.ltu p6,p0=r28,r27
1014
{ .mii; getf.sig r19=f84
1015
cmp.ltu p7,p0=r17,r16 }
1017
(p6) add carry1=1,carry1
1018
cmp.ltu p6,p0=r29,r28
1020
{ .mii; getf.sig r20=f75
1023
(p6) add carry1=1,carry1
1024
cmp.ltu p6,p0=r30,r29
1026
{ .mfb; getf.sig r21=f66 }
1027
{ .mii; (p7) add carry3=1,carry3
1028
cmp.ltu p7,p0=r18,r17
1032
(p6) add carry1=1,carry1
1033
cmp.ltu p6,p0=r31,r30
1034
add r31=r31,carry2 };;
1035
{ .mfb; getf.sig r22=f57 }
1036
{ .mii; (p7) add carry3=1,carry3
1037
cmp.ltu p7,p0=r19,r18
1041
(p6) add carry1=1,carry1
1042
cmp.ltu p6,p0=r31,carry2 };;
1043
{ .mfb; getf.sig r23=f48 }
1044
{ .mii; (p7) add carry3=1,carry3
1045
cmp.ltu p7,p0=r20,r19
1048
(p6) add carry1=1,carry1 }
1049
{ .mfb; st8 [r33]=r31,16 };;
1051
{ .mfb; getf.sig r24=f112 }
1052
{ .mii; (p7) add carry3=1,carry3
1053
cmp.ltu p7,p0=r21,r20
1055
{ .mfb; getf.sig r25=f103 }
1056
{ .mii; (p7) add carry3=1,carry3
1057
cmp.ltu p7,p0=r22,r21
1059
{ .mfb; getf.sig r26=f94 }
1060
{ .mii; (p7) add carry3=1,carry3
1061
cmp.ltu p7,p0=r23,r22
1062
add r23=r23,carry1 };;
1063
{ .mfb; getf.sig r27=f85 }
1064
{ .mii; (p7) add carry3=1,carry3
1065
cmp.ltu p7,p8=r23,carry1};;
1066
{ .mii; getf.sig r28=f76
1069
{ .mii; st8 [r32]=r23,16
1070
(p7) add carry2=1,carry3
1071
(p8) add carry2=0,carry3 };;
1074
{ .mii; getf.sig r29=f67
1075
cmp.ltu p6,p0=r25,r24
1077
{ .mfb; getf.sig r30=f58 }
1079
(p6) add carry1=1,carry1
1080
cmp.ltu p6,p0=r26,r25
1082
{ .mfb; getf.sig r16=f113 }
1084
(p6) add carry1=1,carry1
1085
cmp.ltu p6,p0=r27,r26
1087
{ .mfb; getf.sig r17=f104 }
1089
(p6) add carry1=1,carry1
1090
cmp.ltu p6,p0=r28,r27
1092
{ .mfb; getf.sig r18=f95 }
1094
(p6) add carry1=1,carry1
1095
cmp.ltu p6,p0=r29,r28
1097
{ .mii; getf.sig r19=f86
1101
(p6) add carry1=1,carry1
1102
cmp.ltu p6,p0=r30,r29
1103
add r30=r30,carry2 };;
1104
{ .mii; getf.sig r20=f77
1105
cmp.ltu p7,p0=r17,r16
1108
(p6) add carry1=1,carry1
1109
cmp.ltu p6,p0=r30,carry2 };;
1110
{ .mfb; getf.sig r21=f68 }
1111
{ .mii; st8 [r33]=r30,16
1112
(p6) add carry1=1,carry1 };;
1114
{ .mfb; getf.sig r24=f114 }
1115
{ .mii; (p7) add carry3=1,carry3
1116
cmp.ltu p7,p0=r18,r17
1118
{ .mfb; getf.sig r25=f105 }
1119
{ .mii; (p7) add carry3=1,carry3
1120
cmp.ltu p7,p0=r19,r18
1122
{ .mfb; getf.sig r26=f96 }
1123
{ .mii; (p7) add carry3=1,carry3
1124
cmp.ltu p7,p0=r20,r19
1126
{ .mfb; getf.sig r27=f87 }
1127
{ .mii; (p7) add carry3=1,carry3
1128
cmp.ltu p7,p0=r21,r20
1129
add r21=r21,carry1 };;
1130
{ .mib; getf.sig r28=f78
1132
{ .mib; (p7) add carry3=1,carry3
1133
cmp.ltu p7,p8=r21,carry1};;
1134
{ .mii; st8 [r32]=r21,16
1135
(p7) add carry2=1,carry3
1136
(p8) add carry2=0,carry3 }
1138
{ .mii; mov carry1=0
1139
cmp.ltu p6,p0=r25,r24
1141
{ .mfb; getf.sig r16=f115 }
1143
(p6) add carry1=1,carry1
1144
cmp.ltu p6,p0=r26,r25
1146
{ .mfb; getf.sig r17=f106 }
1148
(p6) add carry1=1,carry1
1149
cmp.ltu p6,p0=r27,r26
1151
{ .mfb; getf.sig r18=f97 }
1153
(p6) add carry1=1,carry1
1154
cmp.ltu p6,p0=r28,r27
1155
add r28=r28,carry2 };;
1156
{ .mib; getf.sig r19=f88
1159
(p6) add carry1=1,carry1
1160
cmp.ltu p6,p0=r28,carry2 };;
1161
{ .mii; st8 [r33]=r28,16
1162
(p6) add carry1=1,carry1 }
1164
{ .mii; mov carry2=0
1165
cmp.ltu p7,p0=r17,r16
1167
{ .mfb; getf.sig r24=f116 }
1168
{ .mii; (p7) add carry2=1,carry2
1169
cmp.ltu p7,p0=r18,r17
1171
{ .mfb; getf.sig r25=f107 }
1172
{ .mii; (p7) add carry2=1,carry2
1173
cmp.ltu p7,p0=r19,r18
1174
add r19=r19,carry1 };;
1175
{ .mfb; getf.sig r26=f98 }
1176
{ .mii; (p7) add carry2=1,carry2
1177
cmp.ltu p7,p0=r19,carry1};;
1178
{ .mii; st8 [r32]=r19,16
1179
(p7) add carry2=1,carry2 }
1181
{ .mfb; add r25=r25,r24 };;
1183
{ .mfb; getf.sig r16=f117 }
1184
{ .mii; mov carry1=0
1185
cmp.ltu p6,p0=r25,r24
1187
{ .mfb; getf.sig r17=f108 }
1189
(p6) add carry1=1,carry1
1190
cmp.ltu p6,p0=r26,r25
1191
add r26=r26,carry2 };;
1194
(p6) add carry1=1,carry1
1195
cmp.ltu p6,p0=r26,carry2 };;
1196
{ .mii; st8 [r33]=r26,16
1197
(p6) add carry1=1,carry1 }
1199
{ .mfb; add r17=r17,r16 };;
1200
{ .mfb; getf.sig r24=f118 }
1201
{ .mii; mov carry2=0
1202
cmp.ltu p7,p0=r17,r16
1203
add r17=r17,carry1 };;
1204
{ .mii; (p7) add carry2=1,carry2
1205
cmp.ltu p7,p0=r17,carry1};;
1206
{ .mii; st8 [r32]=r17
1207
(p7) add carry2=1,carry2 };;
1208
{ .mfb; add r24=r24,carry2 };;
1209
{ .mib; st8 [r33]=r24 }
1211
{ .mib; rum 1<<5 // clear um.mfh
1212
br.ret.sptk.many b0 };;
1213
.endp bn_mul_comba8#
1220
// It's possible to make it faster (see comment to bn_sqr_comba8), but
1221
// I reckon it isn't worth the effort. Basically because the routine
1222
// (actually both of them) are practically never called... So I just play
1223
// same trick as with bn_sqr_comba8.
1225
// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1227
// bn_sqr_comba4: entry stub for the 4-word squaring routine. It performs
// no multiplication itself: after setting up a few registers it branches
// to .L_cheat_entry_point4 and lets the code behind that label do the work.
.global bn_sqr_comba4#
1228
.proc bn_sqr_comba4#
1234
// HP-UX ILP32 builds are given their own prologue variant.
// NOTE(review): no #else/#endif is visible for this conditional here --
// confirm that the two alloc bundles below belong to separate branches.
#if defined(_HPUX_SOURCE) && defined(_ILP32)
1235
// Register frame: 2 inputs (r32,r33), 1 local (r34), no outputs and no
// rotating registers; the previous ar.pfs value is kept in r2.
{ .mii; alloc r2=ar.pfs,2,1,0,0
1240
{ .mii; alloc r2=ar.pfs,2,1,0,0
1245
// r17 = r34 + 8
// presumably the address of the second word of an operand -- TODO confirm
{ .mii; add r17=8,r34
1248
// r16 = r33 + 24, computed in the same bundle as the branch.
{ .mfb; add r16=24,r33
1249
// Hand over to the shared code path.
br .L_cheat_entry_point4 };;
1250
.endp bn_sqr_comba4#
1254
// Runs in ~115 cycles and ~4.5 times faster than C. Well, whatever...
1256
// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1260
// bn_mul_comba4: fully unrolled 4x4-word multiplication.
// The operand words are loaded into FP registers with ldf8 and multiplied
// with xma.lu/xma.hu (low/high 64 bits of a*b+c). The pieces are then
// moved to general registers with getf.sig, combined with explicit carry
// tracking (cmp.ltu + predicated add) and stored through r32/r33.
// carry1 and carry2 are register names defined outside this routine.
.global bn_mul_comba4#
1261
.proc bn_mul_comba4#
1267
// HP-UX ILP32 builds are given their own prologue variant.
#if defined(_HPUX_SOURCE) && defined(_ILP32)
1268
// Register frame: 3 inputs (r32,r33,r34), no locals, outputs or rotating
// registers; the previous ar.pfs value is kept in r2.
{ .mii; alloc r2=ar.pfs,3,0,0,0
1271
// addp4 turns the 32-bit pointer in r32 into a 64-bit address.
{ .mii; addp4 r32=0,r32
1273
{ .mii; alloc r2=ar.pfs,3,0,0,0
1278
// r15 = r33 + 16
{ .mii; add r15=16,r33
1281
// bn_sqr_comba4 branches to this label to reuse everything below.
.L_cheat_entry_point4:
1282
// r19 = r34 + 24
{ .mmi; add r19=24,r34
1286
// ldf8 loads a 64-bit integer from memory into an FP register so that
// xma can operate on it.
{ .mmi; ldf8 f120=[r34]
1288
{ .mmi; ldf8 f122=[r18]
1291
{ .mmi; ldf8 f33=[r14]
1293
{ .mfi; ldf8 f35=[r16]
1295
// First row: f32 times f120..f123 with f0 (zero) as the addend.
// .hu keeps the high 64 bits of the result, .lu the low 64 bits.
xma.hu f41=f32,f120,f0 }
1296
{ .mfi; xma.lu f40=f32,f120,f0 };;
1297
{ .mfi; xma.hu f51=f32,f121,f0 }
1298
{ .mfi; xma.lu f50=f32,f121,f0 };;
1299
{ .mfi; xma.hu f61=f32,f122,f0 }
1300
{ .mfi; xma.lu f60=f32,f122,f0 };;
1301
{ .mfi; xma.hu f71=f32,f123,f0 }
1302
{ .mfi; xma.lu f70=f32,f123,f0 };;//
1303
// Major stall takes place here, and 3 more places below. Result from
1304
// first xma is not available for another 3 ticks.
1305
// Second row: f33 times f120..f123. Each product adds in the high half
// produced by the previous row (f41, f51, f61, f71), so that value is
// folded into the new low/high pair.
{ .mfi; getf.sig r16=f40
1306
xma.hu f42=f33,f120,f41
1308
{ .mfi; xma.lu f41=f33,f120,f41 };;
1309
{ .mfi; getf.sig r24=f50
1310
xma.hu f52=f33,f121,f51 }
1311
{ .mfi; xma.lu f51=f33,f121,f51 };;
1312
// st8 with post-increment: the word is stored at [r32] and r32 then
// advances by 16 bytes.
{ .mfi; st8 [r32]=r16,16
1313
xma.hu f62=f33,f122,f61 }
1314
{ .mfi; xma.lu f61=f33,f122,f61 };;
1315
{ .mfi; xma.hu f72=f33,f123,f71 }
1316
{ .mfi; xma.lu f71=f33,f123,f71 };;//
1317
//-------------------------------------------------//
1318
// Third row: f34 times f120..f123, again chaining in the previous row's
// high halves (f42, f52, f62, f72).
{ .mfi; getf.sig r25=f41
1319
xma.hu f43=f34,f120,f42 }
1320
{ .mfi; xma.lu f42=f34,f120,f42 };;
1321
{ .mfi; getf.sig r16=f60
1322
xma.hu f53=f34,f121,f52 }
1323
{ .mfi; xma.lu f52=f34,f121,f52 };;
1324
{ .mfi; getf.sig r17=f51
1325
xma.hu f63=f34,f122,f62
1327
// carry1 starts counting from zero here.
{ .mfi; mov carry1=0
1328
xma.lu f62=f34,f122,f62 };;
1329
// r33 is used as a second store pointer that also advances by 16 bytes.
// presumably it was repointed at the remaining result words earlier --
// TODO confirm
{ .mfi; st8 [r33]=r25,16
1330
xma.hu f73=f34,f123,f72
1331
// Unsigned "sum < addend" test: p6 is set when r25 is below r24.
cmp.ltu p6,p0=r25,r24 }
1332
{ .mfi; xma.lu f72=f34,f123,f72 };;//
1333
//-------------------------------------------------//
1334
// Fourth row: f35 times f120..f123, chaining in f43, f53, f63, f73.
{ .mfi; getf.sig r18=f42
1335
xma.hu f44=f35,f120,f43
1336
// Count the carry detected by the compare above.
(p6) add carry1=1,carry1 }
1337
// Carry idiom used throughout: add two words, then compare the sum
// against one addend; a smaller sum means the 64-bit addition wrapped.
{ .mfi; add r17=r17,r16
1338
xma.lu f43=f35,f120,f43
1340
{ .mfi; getf.sig r24=f70
1341
xma.hu f54=f35,f121,f53
1342
cmp.ltu p7,p0=r17,r16 }
1343
{ .mfi; xma.lu f53=f35,f121,f53 };;
1344
{ .mfi; getf.sig r25=f61
1345
xma.hu f64=f35,f122,f63
1347
{ .mfi; xma.lu f63=f35,f122,f63
1348
(p7) add carry2=1,carry2 };;
1349
{ .mfi; getf.sig r26=f52
1350
xma.hu f74=f35,f123,f73
1351
cmp.ltu p7,p0=r18,r17 }
1352
{ .mfi; xma.lu f73=f35,f123,f73
1353
// Fold the carries counted so far into the current word.
add r18=r18,carry1 };;
1354
//-------------------------------------------------//
1355
// From here on only general-register work remains: fetch the remaining
// xma results, finish each word, and store it.
{ .mii; st8 [r32]=r18,16
1356
(p7) add carry2=1,carry2
1357
// Adding carry1 may itself wrap; that is detected here and counted in
// carry2 by the predicated add that follows.
cmp.ltu p7,p0=r18,carry1 };;
1359
{ .mfi; getf.sig r27=f43 // last major stall
1360
(p7) add carry2=1,carry2 };;
1361
{ .mii; getf.sig r16=f71
1364
{ .mii; getf.sig r17=f62
1365
cmp.ltu p6,p0=r25,r24
1368
(p6) add carry1=1,carry1
1369
cmp.ltu p6,p0=r26,r25
1372
(p6) add carry1=1,carry1
1373
cmp.ltu p6,p0=r27,r26
1374
// Fold in the carries accumulated in carry2.
add r27=r27,carry2 };;
1375
{ .mii; getf.sig r18=f53
1376
(p6) add carry1=1,carry1
1377
cmp.ltu p6,p0=r27,carry2 };;
1378
{ .mfi; st8 [r33]=r27,16
1379
(p6) add carry1=1,carry1 }
1381
{ .mii; getf.sig r19=f44
1384
{ .mii; getf.sig r24=f72
1385
cmp.ltu p7,p0=r17,r16
1387
{ .mii; (p7) add carry2=1,carry2
1388
cmp.ltu p7,p0=r18,r17
1390
{ .mii; (p7) add carry2=1,carry2
1391
cmp.ltu p7,p0=r19,r18
1392
add r19=r19,carry1 };;
1393
{ .mii; getf.sig r25=f63
1394
(p7) add carry2=1,carry2
1395
cmp.ltu p7,p0=r19,carry1};;
1396
{ .mii; st8 [r32]=r19,16
1397
(p7) add carry2=1,carry2 }
1399
{ .mii; getf.sig r26=f54
1402
{ .mii; getf.sig r16=f73
1403
cmp.ltu p6,p0=r25,r24
1406
(p6) add carry1=1,carry1
1407
cmp.ltu p6,p0=r26,r25
1408
add r26=r26,carry2 };;
1409
{ .mii; getf.sig r17=f64
1410
(p6) add carry1=1,carry1
1411
cmp.ltu p6,p0=r26,carry2 };;
1412
{ .mii; st8 [r33]=r26,16
1413
(p6) add carry1=1,carry1 }
1415
{ .mii; getf.sig r24=f74
1418
{ .mii; cmp.ltu p7,p0=r17,r16
1419
add r17=r17,carry1 };;
1421
{ .mii; (p7) add carry2=1,carry2
1422
cmp.ltu p7,p0=r17,carry1};;
1423
{ .mii; st8 [r32]=r17,16
1424
(p7) add carry2=1,carry2 };;
1426
// Final word: the last high half (f74, now in r24) plus the carries
// still held in carry2. This store has no post-increment.
{ .mii; add r24=r24,carry2 };;
1427
{ .mii; st8 [r33]=r24 }
1429
{ .mib; rum 1<<5 // clear um.mfh
1430
// Return to the caller through b0.
br.ret.sptk.many b0 };;
1431
.endp bn_mul_comba4#
1438
// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
1440
// In a nutshell, it's a port of my MIPS III/IV implementation.
1451
// Some preprocessors (most notably HP-UX) appear to be allergic to
1452
// macros enclosed in parentheses as these three will be.
1454
// "break" expands to predicate p0, so a compare that names break as a
// target simply drops that result (writes to p0 are discarded).
// NOTE(review): the trailing note names p20, presumably the predicate
// originally intended -- confirm.
#define break p0 // p20
1463
// bn_div_words: word division routine taking h in r32, l in r33 and d in
// r34. H, L, D, HH, DH, AT, I, equ and cont are register/predicate names
// defined outside the lines shown here.
.global bn_div_words#
1471
// Register frame: 3 inputs, 5 locals, no outputs, 8 rotating registers.
{ .mii; alloc r2=ar.pfs,3,5,0,8
1474
// Return straight away when d (r34) is zero.
// NOTE(review): the value handed back in that case is not set on any
// line visible here -- confirm.
{ .mmb; cmp.eq p6,p0=r34,r0
1476
(p6) br.ret.spnt.many b0 };;
1479
{ .mii; mov H=r32 // save h
1480
mov ar.ec=0 // don't rotate at exit
1482
{ .mii; mov L=r33 // save l
1485
// Shift loop built on br.wtop and the rotating registers declared by
// alloc. The loop continues while d is positive as a signed value, i.e.
// while its top bit is clear. Every pass writes r34<<1 to r33 and
// r36+1 to r35.
// presumably register rotation makes those the next pass's r34 and r36,
// so d is shifted and the shift count grows by one per pass -- TODO confirm
.L_divw_shift: // -vv- note signed comparison
1486
{ .mfi; (p0) cmp.lt p16,p0=r0,r34 // d
1487
(p0) shladd r33=r34,1,r0 }
1488
{ .mfb; (p0) add r35=1,r36
1490
(p16) br.wtop.dpnt .L_divw_shift };;
1495
// Move DH into the FP unit; it is converted further down and serves as
// the divisor (f7) for the division helper.
{ .mii; setf.sig f7=DH
1498
// A non-zero AT at this point is treated as overflow and ends in abort().
{ .mib; cmp.ne p6,p0=r0,AT
1500
(p6) br.call.spnt.clr b0=abort };; // overflow, die...
1502
// Unsigned integer -> floating point conversion of the divisor.
{ .mfi; fcvt.xuf.s1 f7=f7
1511
// f14 keeps D for the q*D products below; AT becomes a 32-bit all-ones
// constant.
{ .mlx; setf.sig f14=D
1512
movl AT=0xffffffff };;
1513
///////////////////////////////////////////////////////////
1514
// First quotient step. Its loop label (.L_divw_1st_iter) is not among
// the lines shown here.
{ .mii; setf.sig f6=H
1516
// p6: HH equals DH; p7: they differ, in which case the helper is called.
cmp.eq p6,p7=HH,DH };;
1519
(p7) fcvt.xuf.s1 f6=f6
1520
// The helper takes f6 and f7, leaves the quotient in f8 and returns
// through b6.
(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1522
{ .mfi; getf.sig r33=f8 // q
1524
// High 64 bits of q*D.
{ .mfi; xmpy.hu f10=f8,f14
1527
{ .mmi; getf.sig r35=f9 // tl
1528
getf.sig r31=f10 };; // th
1531
// Correction loop: r32 receives q-1, and the equ/cont predicates decide
// from the th/tl comparisons against HH/H whether another pass runs.
{ .mii; (p0) add r32=-1,r33
1532
(p0) cmp.eq equ,cont=HH,r31 };;
1533
{ .mii; (p0) cmp.ltu p8,p0=r35,D
1535
(equ) cmp.leu break,cont=r35,H };;
1536
{ .mib; (cont) cmp.leu cont,break=HH,r31
1538
(cont) br.wtop.spnt .L_divw_1st_iter };;
1539
///////////////////////////////////////////////////////////
1543
///////////////////////////////////////////////////////////
1544
// Second quotient step: the same sequence as the first one, looping on
// .L_divw_2nd_iter (label likewise not among the lines shown here).
{ .mii; setf.sig f6=H
1546
cmp.eq p6,p7=HH,DH };;
1549
(p7) fcvt.xuf.s1 f6=f6
1550
(p7) br.call.sptk b6=.L_udiv64_32_b6 };;
1552
{ .mfi; getf.sig r33=f8 // q
1554
{ .mfi; xmpy.hu f10=f8,f14
1557
{ .mmi; getf.sig r35=f9 // tl
1558
getf.sig r31=f10 };; // th
1561
{ .mii; (p0) add r32=-1,r33
1562
(p0) cmp.eq equ,cont=HH,r31 };;
1563
{ .mii; (p0) cmp.ltu p8,p0=r35,D
1565
(equ) cmp.leu break,cont=r35,H };;
1566
{ .mib; (cont) cmp.leu cont,break=HH,r31
1568
(cont) br.wtop.spnt .L_divw_2nd_iter };;
1569
///////////////////////////////////////////////////////////
1573
// Epilogue: r9 = H >> I, predicates reloaded from r10, return via b0.
// presumably r10 holds the predicate set saved at entry -- TODO confirm
{ .mii; shr.u r9=H,I // remainder if anybody wants it
1574
mov pr=r10,0x1ffff }
1575
{ .mfb; br.ret.sptk.many b0 };;
1577
// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
1580
// inputs: f6 = (double)a, f7 = (double)b
1581
// output: f8 = (int)(a/b)
1582
// clobbered: f8,f9,f10,f11,pred
1584
// This procedure is essentially Intel code and therefore is
1585
// copyrighted to Intel Corporation (I suppose...). It's slightly
1586
// modified for specific needs.
1590
// Floating-point division sequence: a is expected in f6 and b in f7, and
// the truncated quotient is left in f8. It starts from the frcpa
// reciprocal approximation and refines it step by step; the bracketed
// numbers in the comments are the original author's cycle annotations.
// Every refinement step is predicated on "pred", which frcpa sets, so
// the steps are skipped whenever frcpa clears it.
frcpa.s1 f8,pred=f6,f7;; // [0] y0 = 1 / b
1592
(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
1593
(pred) fmpy.s1 f10=f6,f8;; // [5] q0 = a * y0
1594
(pred) fmpy.s1 f11=f9,f9 // [10] e1 = e0 * e0
1595
(pred) fma.s1 f10=f9,f10,f10;; // [10] q1 = q0 + e0 * q0
1596
(pred) fma.s1 f8=f9,f8,f8 //;; // [15] y1 = y0 + e0 * y0
1597
(pred) fma.s1 f9=f11,f10,f10;; // [15] q2 = q1 + e1 * q1
1598
(pred) fma.s1 f8=f11,f8,f8 //;; // [20] y2 = y1 + e1 * y1
1599
(pred) fnma.s1 f10=f7,f9,f6;; // [20] r2 = a - b * q2
1600
(pred) fma.s1 f8=f10,f8,f9;; // [25] q3 = q2 + r2 * y2
1602
// Not predicated: f8 is always converted to an unsigned integer with
// truncation before returning.
fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
1603
// Return through b6, the branch register named by the br.call sites.
br.ret.sptk.many b6;;