3
* This is a stupid adaptation of ATL_cmm4x4x128_av.c to complex. I was
4
* forced to add it, because the newest gcc does terrible with intrinsics
5
* for some reason (8Gflop rather than 12). In order to get something to
6
* work, I simply wrote a code to do all access of C using the scalar FPU,
7
* meaning that this kernel will do an extra store of vector C to memory
8
* for load by the scalar FPU.
10
#include "atlas_asm.h"
12
#if !defined(ATL_AS_OSX_PPC) && !defined(ATL_GAS_LINUX_PPC)
13
#error "This kernel requires OS X or Linux PPC assembler!"
15
#if !defined(KB) || KB == 0
16
#error "This kernel requires KB be a compile-time constant!"
19
#error "This kernel requires KB < 92!"
24
#ifdef ATL_GAS_LINUX_PPC
25
/* Read the VRSAVE SPR into r_ using the dedicated Linux-gas mnemonic. */
#define ATL_ReadVRSAVE(r_) mfvrsave r_
26
/* Write r_ to the VRSAVE SPR using the dedicated Linux-gas mnemonic. */
#define ATL_WriteVRSAVE(r_) mtvrsave r_
28
/* Same VRSAVE read via the generic mfspr form (assembler branch where the
 * mfvrsave mnemonic is unavailable -- matching #else not visible in this chunk). */
#define ATL_ReadVRSAVE(r_) mfspr r_, VRsave
29
/* Same VRSAVE write via the generic mtspr form (non-Linux assembler branch). */
#define ATL_WriteVRSAVE(r_) mtspr VRsave, r_
33
/* Scale an element count/index for complex data: each complex element
 * occupies two scalars (real, imag) -- see complex-adaptation note in header. */
#define CMUL(i_) ((i_)*2)
63
#elif defined(ATL_USE64BITS)
98
#if defined(ATL_USE64BITS)
105
/* Byte offset within the stack frame (FSIZE is defined outside this chunk);
 * NOTE(review): presumably a 16-byte-aligned save slot near frame top -- confirm. */
#define BOFF FSIZE-16
161
*******************************************************************************
162
32 bit ABIs: (linux in parenthesis)
164
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
165
(r6) r7 (r7) r8 (r8) r9 (r9) r10
166
const TYPE *A, const int lda, const TYPE *B, const int ldb,
168
const TYPE beta, TYPE *C, const int ldc)
170
*******************************************************************************
173
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
175
const TYPE *A, const int lda, const TYPE *B, const int ldb,
177
const TYPE beta, TYPE *C, const int ldc)
180
#ifdef ATL_AS_OSX_PPC
181
.globl Mjoin(_,ATL_USERMM)
184
#if defined(ATL_USE64BITS)
186
* Official Program Descripter section, seg fault w/o it on Linux/PPC64
193
.quad Mjoin(.,ATL_USERMM),.TOC.@tocbase,0
195
.type Mjoin(.,ATL_USERMM),@function
197
.globl Mjoin(.,ATL_USERMM)
205
#if defined(ATL_USE64BITS)
255
vxor v0, v0, v0 /* zero v0 */
256
mtvscr v0 /* force IEEE compliance */
258
eqv r0, r0, r0 /* all 1s */
259
ATL_WriteVRSAVE(r0) /* signal we use all vector regs */
260
#if defined (ATL_USE64BITS)
261
ld pC0, FSIZE+120(r1)
262
ld ldc, FSIZE+128(r1)
263
#elif defined(ATL_AS_OSX_PPC)
264
lwz pC0, FSIZE+60(r1)
265
lwz ldc, FSIZE+64(r1)
269
slwi ldc, ldc, SHF /* ldc = ldc*sizeof */
270
slwi pfA, M, SHF /* pfA = M*sizeof() */
272
sub incCn, incCn, pfA /* incCn = ldc*4 - M */
273
mulli incAn, M, KB*4 /* incAn = M*KB*sizeof() */
274
add pfA, pA0, incAn /* pfA = A + M*KB */
275
srwi M, M, 2 /* M /= 4 */
277
// pA0 = pA0 - incAn + KB4*4 = pA0 -(incAn - KB*4)
279
addi incAn, k1, -KB4*4
280
addi incCn, incCn, incCm
306
vxor vC33, vC33, vC33
321
#if MB == 0 || MB > 4
330
vmaddfp vC00, vA0, vB0, vC33
334
vmaddfp vC10, vA1, vB0, vC33
338
vmaddfp vC20, vA2, vB0, vC33
342
vmaddfp vC30, vA3, vB0, vC33
346
vmaddfp vC01, vA0, vB1, vC33
347
vmaddfp vC11, vA1, vB1, vC33
349
vmaddfp vC21, vA2, vB1, vC33
351
vmaddfp vC31, vA3, vB1, vC33
355
vmaddfp vC02, vA0, vB2, vC33
356
vmaddfp vC12, vA1, vB2, vC33
357
vmaddfp vC22, vA2, vB2, vC33
358
vmaddfp vC32, vA3, vB2, vC33
362
vmaddfp vC03, vA0, vB3, vC33
366
vmaddfp vC13, vA1, vB3, vC33
370
vmaddfp vC23, vA2, vB3, vC33
374
vmaddfp vC33, vA3, vB3, vC33
381
#endif /* end K=0 block */
386
vmaddfp vC00, va0, vb0, vC00
390
vmaddfp vC10, va1, vb0, vC10
394
vmaddfp vC20, va2, vb0, vC20
398
vmaddfp vC30, va3, vb0, vC30
402
vmaddfp vC01, va0, vb1, vC01
403
vmaddfp vC11, va1, vb1, vC11
404
vmaddfp vC21, va2, vb1, vC21
405
vmaddfp vC31, va3, vb1, vC31
409
vmaddfp vC02, va0, vb2, vC02
410
vmaddfp vC12, va1, vb2, vC12
411
vmaddfp vC22, va2, vb2, vC22
412
vmaddfp vC32, va3, vb2, vC32
416
vmaddfp vC03, va0, vb3, vC03
420
vmaddfp vC13, va1, vb3, vC13
424
vmaddfp vC23, va2, vb3, vC23
428
vmaddfp vC33, va3, vb3, vC33
435
#endif /* end K=4 block */
440
vmaddfp vC00, vA0, vB0, vC00
444
vmaddfp vC10, vA1, vB0, vC10
448
vmaddfp vC20, vA2, vB0, vC20
452
vmaddfp vC30, vA3, vB0, vC30
456
vmaddfp vC01, vA0, vB1, vC01
457
vmaddfp vC11, vA1, vB1, vC11
458
vmaddfp vC21, vA2, vB1, vC21
459
vmaddfp vC31, vA3, vB1, vC31
463
vmaddfp vC02, vA0, vB2, vC02
464
vmaddfp vC12, vA1, vB2, vC12
465
vmaddfp vC22, vA2, vB2, vC22
466
vmaddfp vC32, vA3, vB2, vC32
470
vmaddfp vC03, vA0, vB3, vC03
474
vmaddfp vC13, vA1, vB3, vC13
478
vmaddfp vC23, vA2, vB3, vC23
482
vmaddfp vC33, vA3, vB3, vC33
489
#endif /* end K=8 block */
494
vmaddfp vC00, va0, vb0, vC00
498
vmaddfp vC10, va1, vb0, vC10
502
vmaddfp vC20, va2, vb0, vC20
506
vmaddfp vC30, va3, vb0, vC30
510
vmaddfp vC01, va0, vb1, vC01
511
vmaddfp vC11, va1, vb1, vC11
512
vmaddfp vC21, va2, vb1, vC21
513
vmaddfp vC31, va3, vb1, vC31
517
vmaddfp vC02, va0, vb2, vC02
518
vmaddfp vC12, va1, vb2, vC12
519
vmaddfp vC22, va2, vb2, vC22
520
vmaddfp vC32, va3, vb2, vC32
524
vmaddfp vC03, va0, vb3, vC03
528
vmaddfp vC13, va1, vb3, vC13
532
vmaddfp vC23, va2, vb3, vC23
536
vmaddfp vC33, va3, vb3, vC33
543
#endif /* end K=12 block */
548
vmaddfp vC00, vA0, vB0, vC00
552
vmaddfp vC10, vA1, vB0, vC10
556
vmaddfp vC20, vA2, vB0, vC20
560
vmaddfp vC30, vA3, vB0, vC30
564
vmaddfp vC01, vA0, vB1, vC01
565
vmaddfp vC11, vA1, vB1, vC11
566
vmaddfp vC21, vA2, vB1, vC21
567
vmaddfp vC31, vA3, vB1, vC31
571
vmaddfp vC02, vA0, vB2, vC02
572
vmaddfp vC12, vA1, vB2, vC12
573
vmaddfp vC22, vA2, vB2, vC22
574
vmaddfp vC32, vA3, vB2, vC32
578
vmaddfp vC03, vA0, vB3, vC03
582
vmaddfp vC13, vA1, vB3, vC13
586
vmaddfp vC23, vA2, vB3, vC23
590
vmaddfp vC33, vA3, vB3, vC33
597
#endif /* end K=16 block */
602
vmaddfp vC00, va0, vb0, vC00
606
vmaddfp vC10, va1, vb0, vC10
610
vmaddfp vC20, va2, vb0, vC20
614
vmaddfp vC30, va3, vb0, vC30
618
vmaddfp vC01, va0, vb1, vC01
619
vmaddfp vC11, va1, vb1, vC11
620
vmaddfp vC21, va2, vb1, vC21
621
vmaddfp vC31, va3, vb1, vC31
625
vmaddfp vC02, va0, vb2, vC02
626
vmaddfp vC12, va1, vb2, vC12
627
vmaddfp vC22, va2, vb2, vC22
628
vmaddfp vC32, va3, vb2, vC32
632
vmaddfp vC03, va0, vb3, vC03
636
vmaddfp vC13, va1, vb3, vC13
640
vmaddfp vC23, va2, vb3, vC23
644
vmaddfp vC33, va3, vb3, vC33
651
#endif /* end K=20 block */
656
vmaddfp vC00, vA0, vB0, vC00
660
vmaddfp vC10, vA1, vB0, vC10
664
vmaddfp vC20, vA2, vB0, vC20
668
vmaddfp vC30, vA3, vB0, vC30
672
vmaddfp vC01, vA0, vB1, vC01
673
vmaddfp vC11, vA1, vB1, vC11
674
vmaddfp vC21, vA2, vB1, vC21
675
vmaddfp vC31, vA3, vB1, vC31
679
vmaddfp vC02, vA0, vB2, vC02
680
vmaddfp vC12, vA1, vB2, vC12
681
vmaddfp vC22, vA2, vB2, vC22
682
vmaddfp vC32, vA3, vB2, vC32
686
vmaddfp vC03, vA0, vB3, vC03
690
vmaddfp vC13, vA1, vB3, vC13
694
vmaddfp vC23, vA2, vB3, vC23
698
vmaddfp vC33, vA3, vB3, vC33
705
#endif /* end K=24 block */
710
vmaddfp vC00, va0, vb0, vC00
714
vmaddfp vC10, va1, vb0, vC10
718
vmaddfp vC20, va2, vb0, vC20
722
vmaddfp vC30, va3, vb0, vC30
726
vmaddfp vC01, va0, vb1, vC01
727
vmaddfp vC11, va1, vb1, vC11
728
vmaddfp vC21, va2, vb1, vC21
729
vmaddfp vC31, va3, vb1, vC31
733
vmaddfp vC02, va0, vb2, vC02
734
vmaddfp vC12, va1, vb2, vC12
736
vmaddfp vC22, va2, vb2, vC22
738
vmaddfp vC32, va3, vb2, vC32
742
vmaddfp vC03, va0, vb3, vC03
746
vmaddfp vC13, va1, vb3, vC13
750
vmaddfp vC23, va2, vb3, vC23
754
vmaddfp vC33, va3, vb3, vC33
761
#endif /* end K=28 block */
766
vmaddfp vC00, vA0, vB0, vC00
770
vmaddfp vC10, vA1, vB0, vC10
774
vmaddfp vC20, vA2, vB0, vC20
778
vmaddfp vC30, vA3, vB0, vC30
782
vmaddfp vC01, vA0, vB1, vC01
783
vmaddfp vC11, vA1, vB1, vC11
784
vmaddfp vC21, vA2, vB1, vC21
785
vmaddfp vC31, vA3, vB1, vC31
789
vmaddfp vC02, vA0, vB2, vC02
790
vmaddfp vC12, vA1, vB2, vC12
791
vmaddfp vC22, vA2, vB2, vC22
792
vmaddfp vC32, vA3, vB2, vC32
796
vmaddfp vC03, vA0, vB3, vC03
800
vmaddfp vC13, vA1, vB3, vC13
804
vmaddfp vC23, vA2, vB3, vC23
808
vmaddfp vC33, vA3, vB3, vC33
815
#endif /* end K=32 block */
820
vmaddfp vC00, va0, vb0, vC00
824
vmaddfp vC10, va1, vb0, vC10
828
vmaddfp vC20, va2, vb0, vC20
832
vmaddfp vC30, va3, vb0, vC30
836
vmaddfp vC01, va0, vb1, vC01
837
vmaddfp vC11, va1, vb1, vC11
838
vmaddfp vC21, va2, vb1, vC21
839
vmaddfp vC31, va3, vb1, vC31
843
vmaddfp vC02, va0, vb2, vC02
844
vmaddfp vC12, va1, vb2, vC12
845
vmaddfp vC22, va2, vb2, vC22
846
vmaddfp vC32, va3, vb2, vC32
850
vmaddfp vC03, va0, vb3, vC03
854
vmaddfp vC13, va1, vb3, vC13
858
vmaddfp vC23, va2, vb3, vC23
862
vmaddfp vC33, va3, vb3, vC33
869
#endif /* end K=36 block */
874
vmaddfp vC00, vA0, vB0, vC00
878
vmaddfp vC10, vA1, vB0, vC10
882
vmaddfp vC20, vA2, vB0, vC20
886
vmaddfp vC30, vA3, vB0, vC30
890
vmaddfp vC01, vA0, vB1, vC01
891
vmaddfp vC11, vA1, vB1, vC11
892
vmaddfp vC21, vA2, vB1, vC21
893
vmaddfp vC31, vA3, vB1, vC31
897
vmaddfp vC02, vA0, vB2, vC02
898
vmaddfp vC12, vA1, vB2, vC12
899
vmaddfp vC22, vA2, vB2, vC22
900
vmaddfp vC32, vA3, vB2, vC32
904
vmaddfp vC03, vA0, vB3, vC03
908
vmaddfp vC13, vA1, vB3, vC13
912
vmaddfp vC23, vA2, vB3, vC23
916
vmaddfp vC33, vA3, vB3, vC33
923
#endif /* end K=40 block */
928
vmaddfp vC00, va0, vb0, vC00
932
vmaddfp vC10, va1, vb0, vC10
936
vmaddfp vC20, va2, vb0, vC20
940
vmaddfp vC30, va3, vb0, vC30
944
vmaddfp vC01, va0, vb1, vC01
945
vmaddfp vC11, va1, vb1, vC11
946
vmaddfp vC21, va2, vb1, vC21
947
vmaddfp vC31, va3, vb1, vC31
951
vmaddfp vC02, va0, vb2, vC02
952
vmaddfp vC12, va1, vb2, vC12
953
vmaddfp vC22, va2, vb2, vC22
954
vmaddfp vC32, va3, vb2, vC32
958
vmaddfp vC03, va0, vb3, vC03
962
vmaddfp vC13, va1, vb3, vC13
966
vmaddfp vC23, va2, vb3, vC23
970
vmaddfp vC33, va3, vb3, vC33
977
#endif /* end K=44 block */
982
vmaddfp vC00, vA0, vB0, vC00
986
vmaddfp vC10, vA1, vB0, vC10
990
vmaddfp vC20, vA2, vB0, vC20
994
vmaddfp vC30, vA3, vB0, vC30
998
vmaddfp vC01, vA0, vB1, vC01
999
vmaddfp vC11, vA1, vB1, vC11
1000
vmaddfp vC21, vA2, vB1, vC21
1001
vmaddfp vC31, vA3, vB1, vC31
1005
vmaddfp vC02, vA0, vB2, vC02
1006
vmaddfp vC12, vA1, vB2, vC12
1007
vmaddfp vC22, vA2, vB2, vC22
1008
vmaddfp vC32, vA3, vB2, vC32
1012
vmaddfp vC03, vA0, vB3, vC03
1016
vmaddfp vC13, vA1, vB3, vC13
1020
vmaddfp vC23, vA2, vB3, vC23
1024
vmaddfp vC33, vA3, vB3, vC33
1031
#endif /* end K=48 block */
1036
vmaddfp vC00, va0, vb0, vC00
1040
vmaddfp vC10, va1, vb0, vC10
1044
vmaddfp vC20, va2, vb0, vC20
1048
vmaddfp vC30, va3, vb0, vC30
1052
vmaddfp vC01, va0, vb1, vC01
1053
vmaddfp vC11, va1, vb1, vC11
1054
vmaddfp vC21, va2, vb1, vC21
1055
vmaddfp vC31, va3, vb1, vC31
1059
vmaddfp vC02, va0, vb2, vC02
1060
vmaddfp vC12, va1, vb2, vC12
1061
vmaddfp vC22, va2, vb2, vC22
1062
vmaddfp vC32, va3, vb2, vC32
1066
vmaddfp vC03, va0, vb3, vC03
1070
vmaddfp vC13, va1, vb3, vC13
1074
vmaddfp vC23, va2, vb3, vC23
1078
vmaddfp vC33, va3, vb3, vC33
1085
#endif /* end K=52 block */
1090
vmaddfp vC00, vA0, vB0, vC00
1094
vmaddfp vC10, vA1, vB0, vC10
1098
vmaddfp vC20, vA2, vB0, vC20
1102
vmaddfp vC30, vA3, vB0, vC30
1106
vmaddfp vC01, vA0, vB1, vC01
1107
vmaddfp vC11, vA1, vB1, vC11
1108
vmaddfp vC21, vA2, vB1, vC21
1109
vmaddfp vC31, vA3, vB1, vC31
1113
vmaddfp vC02, vA0, vB2, vC02
1114
vmaddfp vC12, vA1, vB2, vC12
1115
vmaddfp vC22, vA2, vB2, vC22
1116
vmaddfp vC32, vA3, vB2, vC32
1120
vmaddfp vC03, vA0, vB3, vC03
1124
vmaddfp vC13, vA1, vB3, vC13
1128
vmaddfp vC23, vA2, vB3, vC23
1132
vmaddfp vC33, vA3, vB3, vC33
1139
#endif /* end K=56 block */
1144
vmaddfp vC00, va0, vb0, vC00
1148
vmaddfp vC10, va1, vb0, vC10
1152
vmaddfp vC20, va2, vb0, vC20
1156
vmaddfp vC30, va3, vb0, vC30
1160
vmaddfp vC01, va0, vb1, vC01
1161
vmaddfp vC11, va1, vb1, vC11
1162
vmaddfp vC21, va2, vb1, vC21
1163
vmaddfp vC31, va3, vb1, vC31
1167
vmaddfp vC02, va0, vb2, vC02
1168
vmaddfp vC12, va1, vb2, vC12
1169
vmaddfp vC22, va2, vb2, vC22
1170
vmaddfp vC32, va3, vb2, vC32
1174
vmaddfp vC03, va0, vb3, vC03
1178
vmaddfp vC13, va1, vb3, vC13
1182
vmaddfp vC23, va2, vb3, vC23
1186
vmaddfp vC33, va3, vb3, vC33
1193
#endif /* end K=60 block */
1198
vmaddfp vC00, vA0, vB0, vC00
1202
vmaddfp vC10, vA1, vB0, vC10
1206
vmaddfp vC20, vA2, vB0, vC20
1210
vmaddfp vC30, vA3, vB0, vC30
1214
vmaddfp vC01, vA0, vB1, vC01
1215
vmaddfp vC11, vA1, vB1, vC11
1216
vmaddfp vC21, vA2, vB1, vC21
1217
vmaddfp vC31, vA3, vB1, vC31
1221
vmaddfp vC02, vA0, vB2, vC02
1222
vmaddfp vC12, vA1, vB2, vC12
1223
vmaddfp vC22, vA2, vB2, vC22
1224
vmaddfp vC32, vA3, vB2, vC32
1228
vmaddfp vC03, vA0, vB3, vC03
1232
vmaddfp vC13, vA1, vB3, vC13
1236
vmaddfp vC23, vA2, vB3, vC23
1240
vmaddfp vC33, vA3, vB3, vC33
1247
#endif /* end K=64 block */
1252
vmaddfp vC00, va0, vb0, vC00
1256
vmaddfp vC10, va1, vb0, vC10
1260
vmaddfp vC20, va2, vb0, vC20
1264
vmaddfp vC30, va3, vb0, vC30
1268
vmaddfp vC01, va0, vb1, vC01
1269
vmaddfp vC11, va1, vb1, vC11
1270
vmaddfp vC21, va2, vb1, vC21
1271
vmaddfp vC31, va3, vb1, vC31
1275
vmaddfp vC02, va0, vb2, vC02
1276
vmaddfp vC12, va1, vb2, vC12
1277
vmaddfp vC22, va2, vb2, vC22
1278
vmaddfp vC32, va3, vb2, vC32
1282
vmaddfp vC03, va0, vb3, vC03
1286
vmaddfp vC13, va1, vb3, vC13
1290
vmaddfp vC23, va2, vb3, vC23
1294
vmaddfp vC33, va3, vb3, vC33
1301
#endif /* end K=68 block */
1306
vmaddfp vC00, vA0, vB0, vC00
1310
vmaddfp vC10, vA1, vB0, vC10
1314
vmaddfp vC20, vA2, vB0, vC20
1318
vmaddfp vC30, vA3, vB0, vC30
1322
vmaddfp vC01, vA0, vB1, vC01
1326
vmaddfp vC11, vA1, vB1, vC11
1327
vmaddfp vC21, vA2, vB1, vC21
1328
vmaddfp vC31, vA3, vB1, vC31
1332
vmaddfp vC02, vA0, vB2, vC02
1333
vmaddfp vC12, vA1, vB2, vC12
1334
vmaddfp vC22, vA2, vB2, vC22
1335
vmaddfp vC32, vA3, vB2, vC32
1339
vmaddfp vC03, vA0, vB3, vC03
1343
vmaddfp vC13, vA1, vB3, vC13
1347
vmaddfp vC23, vA2, vB3, vC23
1351
vmaddfp vC33, vA3, vB3, vC33
1358
#endif /* end K=72 block */
1363
vmaddfp vC00, va0, vb0, vC00
1367
vmaddfp vC10, va1, vb0, vC10
1371
vmaddfp vC20, va2, vb0, vC20
1375
vmaddfp vC30, va3, vb0, vC30
1379
vmaddfp vC01, va0, vb1, vC01
1380
vmaddfp vC11, va1, vb1, vC11
1381
vmaddfp vC21, va2, vb1, vC21
1382
vmaddfp vC31, va3, vb1, vC31
1386
vmaddfp vC02, va0, vb2, vC02
1387
vmaddfp vC12, va1, vb2, vC12
1388
vmaddfp vC22, va2, vb2, vC22
1389
vmaddfp vC32, va3, vb2, vC32
1393
vmaddfp vC03, va0, vb3, vC03
1397
vmaddfp vC13, va1, vb3, vC13
1401
vmaddfp vC23, va2, vb3, vC23
1405
vmaddfp vC33, va3, vb3, vC33
1412
#endif /* end K=76 block */
1417
vmaddfp vC00, vA0, vB0, vC00
1421
vmaddfp vC10, vA1, vB0, vC10
1425
vmaddfp vC20, vA2, vB0, vC20
1429
vmaddfp vC30, vA3, vB0, vC30
1433
vmaddfp vC01, vA0, vB1, vC01
1434
vmaddfp vC11, vA1, vB1, vC11
1435
vmaddfp vC21, vA2, vB1, vC21
1436
vmaddfp vC31, vA3, vB1, vC31
1440
vmaddfp vC02, vA0, vB2, vC02
1441
vmaddfp vC12, vA1, vB2, vC12
1442
vmaddfp vC22, vA2, vB2, vC22
1443
vmaddfp vC32, vA3, vB2, vC32
1447
vmaddfp vC03, vA0, vB3, vC03
1451
vmaddfp vC13, vA1, vB3, vC13
1455
vmaddfp vC23, vA2, vB3, vC23
1459
vmaddfp vC33, vA3, vB3, vC33
1466
#endif /* end K=80 block */
1471
vmaddfp vC00, va0, vb0, vC00
1475
vmaddfp vC10, va1, vb0, vC10
1479
vmaddfp vC20, va2, vb0, vC20
1483
vmaddfp vC30, va3, vb0, vC30
1487
vmaddfp vC01, va0, vb1, vC01
1488
vmaddfp vC11, va1, vb1, vC11
1489
vmaddfp vC21, va2, vb1, vC21
1490
vmaddfp vC31, va3, vb1, vC31
1494
vmaddfp vC02, va0, vb2, vC02
1495
vmaddfp vC12, va1, vb2, vC12
1496
vmaddfp vC22, va2, vb2, vC22
1497
vmaddfp vC32, va3, vb2, vC32
1501
vmaddfp vC03, va0, vb3, vC03
1505
vmaddfp vC13, va1, vb3, vC13
1509
vmaddfp vC23, va2, vb3, vC23
1513
vmaddfp vC33, va3, vb3, vC33
1520
#endif /* end K=84 block */
1521
#if KB > 88 /* HERE HERE */
1525
vmaddfp vC00, vA0, vB0, vC00
1529
vmaddfp vC10, vA1, vB0, vC10
1533
vmaddfp vC20, vA2, vB0, vC20
1537
vmaddfp vC30, vA3, vB0, vC30
1541
vmaddfp vC01, vA0, vB1, vC01
1542
vmaddfp vC11, vA1, vB1, vC11
1543
vmaddfp vC21, vA2, vB1, vC21
1544
vmaddfp vC31, vA3, vB1, vC31
1548
vmaddfp vC02, vA0, vB2, vC02
1549
vmaddfp vC12, vA1, vB2, vC12
1550
vmaddfp vC22, vA2, vB2, vC22
1551
vmaddfp vC32, vA3, vB2, vC32
1555
vmaddfp vC03, vA0, vB3, vC03
1559
vmaddfp vC13, vA1, vB3, vC13
1563
vmaddfp vC23, vA2, vB3, vC23
1567
vmaddfp vC33, vA3, vB3, vC33
1574
#endif /* end K=88 block */
1579
vmaddfp vC00, va0, vb0, vC00
1583
vmaddfp vC10, va1, vb0, vC10
1587
vmaddfp vC20, va2, vb0, vC20
1591
vmaddfp vC30, va3, vb0, vC30
1595
vmaddfp vC01, va0, vb1, vC01
1596
vmaddfp vC11, va1, vb1, vC11
1597
vmaddfp vC21, va2, vb1, vC21
1598
vmaddfp vC31, va3, vb1, vC31
1602
vmaddfp vC02, va0, vb2, vC02
1603
vmaddfp vC12, va1, vb2, vC12
1604
vmaddfp vC22, va2, vb2, vC22
1605
vmaddfp vC32, va3, vb2, vC32
1609
vmaddfp vC03, va0, vb3, vC03
1613
vmaddfp vC13, va1, vb3, vC13
1617
vmaddfp vC23, va2, vb3, vC23
1621
vmaddfp vC33, va3, vb3, vC33
1628
#endif /* end K=92 block */
1633
vmaddfp vC00, vA0, vB0, vC00
1637
vmaddfp vC10, vA1, vB0, vC10
1641
vmaddfp vC20, vA2, vB0, vC20
1645
vmaddfp vC30, vA3, vB0, vC30
1649
vmaddfp vC01, vA0, vB1, vC01
1650
vmaddfp vC11, vA1, vB1, vC11
1651
vmaddfp vC21, vA2, vB1, vC21
1652
vmaddfp vC31, vA3, vB1, vC31
1656
vmaddfp vC02, vA0, vB2, vC02
1657
vmaddfp vC12, vA1, vB2, vC12
1658
vmaddfp vC22, vA2, vB2, vC22
1659
vmaddfp vC32, vA3, vB2, vC32
1663
vmaddfp vC03, vA0, vB3, vC03
1667
vmaddfp vC13, vA1, vB3, vC13
1671
vmaddfp vC23, vA2, vB3, vC23
1675
vmaddfp vC33, vA3, vB3, vC33
1682
#endif /* end K=96 block */
1687
vmaddfp vC00, va0, vb0, vC00
1691
vmaddfp vC10, va1, vb0, vC10
1695
vmaddfp vC20, va2, vb0, vC20
1699
vmaddfp vC30, va3, vb0, vC30
1703
vmaddfp vC01, va0, vb1, vC01
1704
vmaddfp vC11, va1, vb1, vC11
1705
vmaddfp vC21, va2, vb1, vC21
1706
vmaddfp vC31, va3, vb1, vC31
1710
vmaddfp vC02, va0, vb2, vC02
1711
vmaddfp vC12, va1, vb2, vC12
1712
vmaddfp vC22, va2, vb2, vC22
1713
vmaddfp vC32, va3, vb2, vC32
1717
vmaddfp vC03, va0, vb3, vC03
1721
vmaddfp vC13, va1, vb3, vC13
1725
vmaddfp vC23, va2, vb3, vC23
1729
vmaddfp vC33, va3, vb3, vC33
1736
#endif /* end K=100 block */
1741
vmaddfp vC00, vA0, vB0, vC00
1745
vmaddfp vC10, vA1, vB0, vC10
1749
vmaddfp vC20, vA2, vB0, vC20
1753
vmaddfp vC30, vA3, vB0, vC30
1757
vmaddfp vC01, vA0, vB1, vC01
1758
vmaddfp vC11, vA1, vB1, vC11
1759
vmaddfp vC21, vA2, vB1, vC21
1760
vmaddfp vC31, vA3, vB1, vC31
1764
vmaddfp vC02, vA0, vB2, vC02
1765
vmaddfp vC12, vA1, vB2, vC12
1766
vmaddfp vC22, vA2, vB2, vC22
1767
vmaddfp vC32, vA3, vB2, vC32
1771
vmaddfp vC03, vA0, vB3, vC03
1775
vmaddfp vC13, vA1, vB3, vC13
1779
vmaddfp vC23, vA2, vB3, vC23
1783
vmaddfp vC33, vA3, vB3, vC33
1790
#endif /* end K=104 block */
1795
vmaddfp vC00, va0, vb0, vC00
1799
vmaddfp vC10, va1, vb0, vC10
1803
vmaddfp vC20, va2, vb0, vC20
1807
vmaddfp vC30, va3, vb0, vC30
1811
vmaddfp vC01, va0, vb1, vC01
1812
vmaddfp vC11, va1, vb1, vC11
1813
vmaddfp vC21, va2, vb1, vC21
1814
vmaddfp vC31, va3, vb1, vC31
1818
vmaddfp vC02, va0, vb2, vC02
1819
vmaddfp vC12, va1, vb2, vC12
1820
vmaddfp vC22, va2, vb2, vC22
1821
vmaddfp vC32, va3, vb2, vC32
1825
vmaddfp vC03, va0, vb3, vC03
1829
vmaddfp vC13, va1, vb3, vC13
1833
vmaddfp vC23, va2, vb3, vC23
1837
vmaddfp vC33, va3, vb3, vC33
1844
#endif /* end K=108 block */
1849
vmaddfp vC00, vA0, vB0, vC00
1853
vmaddfp vC10, vA1, vB0, vC10
1857
vmaddfp vC20, vA2, vB0, vC20
1861
vmaddfp vC30, vA3, vB0, vC30
1865
vmaddfp vC01, vA0, vB1, vC01
1866
vmaddfp vC11, vA1, vB1, vC11
1867
vmaddfp vC21, vA2, vB1, vC21
1868
vmaddfp vC31, vA3, vB1, vC31
1872
vmaddfp vC02, vA0, vB2, vC02
1873
vmaddfp vC12, vA1, vB2, vC12
1874
vmaddfp vC22, vA2, vB2, vC22
1875
vmaddfp vC32, vA3, vB2, vC32
1879
vmaddfp vC03, vA0, vB3, vC03
1883
vmaddfp vC13, vA1, vB3, vC13
1887
vmaddfp vC23, vA2, vB3, vC23
1891
vmaddfp vC33, vA3, vB3, vC33
1898
#endif /* end K=112 block */
1903
vmaddfp vC00, va0, vb0, vC00
1907
vmaddfp vC10, va1, vb0, vC10
1911
vmaddfp vC20, va2, vb0, vC20
1915
vmaddfp vC30, va3, vb0, vC30
1919
vmaddfp vC01, va0, vb1, vC01
1920
vmaddfp vC11, va1, vb1, vC11
1921
vmaddfp vC21, va2, vb1, vC21
1922
vmaddfp vC31, va3, vb1, vC31
1926
vmaddfp vC02, va0, vb2, vC02
1927
vmaddfp vC12, va1, vb2, vC12
1928
vmaddfp vC22, va2, vb2, vC22
1929
vmaddfp vC32, va3, vb2, vC32
1933
vmaddfp vC03, va0, vb3, vC03
1937
vmaddfp vC13, va1, vb3, vC13
1941
vmaddfp vC23, va2, vb3, vC23
1945
vmaddfp vC33, va3, vb3, vC33
1952
#endif /* end K=116 block */
1957
vmaddfp vC00, vA0, vB0, vC00
1961
vmaddfp vC10, vA1, vB0, vC10
1965
vmaddfp vC20, vA2, vB0, vC20
1969
vmaddfp vC30, vA3, vB0, vC30
1973
vmaddfp vC01, vA0, vB1, vC01
1974
vmaddfp vC11, vA1, vB1, vC11
1975
vmaddfp vC21, vA2, vB1, vC21
1976
vmaddfp vC31, vA3, vB1, vC31
1980
vmaddfp vC02, vA0, vB2, vC02
1981
vmaddfp vC12, vA1, vB2, vC12
1982
vmaddfp vC22, vA2, vB2, vC22
1983
vmaddfp vC32, vA3, vB2, vC32
1987
vmaddfp vC03, vA0, vB3, vC03
1991
vmaddfp vC13, vA1, vB3, vC13
1995
vmaddfp vC23, vA2, vB3, vC23
1999
vmaddfp vC33, vA3, vB3, vC33
2006
#endif /* end K=120 block */
2011
vmaddfp vC00, va0, vb0, vC00
2015
vmaddfp vC10, va1, vb0, vC10
2019
vmaddfp vC20, va2, vb0, vC20
2023
vmaddfp vC30, va3, vb0, vC30
2027
vmaddfp vC01, va0, vb1, vC01
2028
vmaddfp vC11, va1, vb1, vC11
2029
vmaddfp vC21, va2, vb1, vC21
2030
vmaddfp vC31, va3, vb1, vC31
2034
vmaddfp vC02, va0, vb2, vC02
2035
vmaddfp vC12, va1, vb2, vC12
2036
vmaddfp vC22, va2, vb2, vC22
2037
vmaddfp vC32, va3, vb2, vC32
2041
vmaddfp vC03, va0, vb3, vC03
2045
vmaddfp vC13, va1, vb3, vC13
2049
vmaddfp vC23, va2, vb3, vC23
2053
vmaddfp vC33, va3, vb3, vC33
2060
#endif /* end K=124 block */
2062
vspltisb vb0, 8 /* vb0={8,8,8,8,8,8,8,8,8, 8, 8, 8, 8, 8, 8} */
2063
addi pA0, pA0, KB4*4 /* pA0 += 4*lda */
2065
vxor vb1, vb1, vb1 /* vb1={0...0} */
2067
vsldoi vb0, vb1, vb0, 8 /* vb0={0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8} */
2069
lvsl vb1, 0, r1 /*vb1={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}*/
2071
vaddubm vb1, vb0, vb1 /*vb1={0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23}*/
2073
vaddubm vb1, vb0, vb1 /*vb1={0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31}*/
2077
* Reduce C vectors to scalars
2079
/* rC00 = {c0a, c0b, c0c, c0d} */
2080
/* rC10 = {c1a, c1b, c1c, c1d} */
2081
/* rC20 = {c2a, c2b, c2c, c2d} */
2082
/* rC30 = {c3a, c3b, c3c, c3d} */
2083
vmrglw vb2, vC00, vC10 // vb2 = {c0c, c1c, c0d, c1d}
2085
vmrghw vC00, vC00, vC10 // vC00 = {c0a, c1a, c0b, c1b}
2087
vaddfp vC00, vC00, vb2 // vC00 = {c0ac, c1ac, c0bd, c1bd}
2089
vmrglw vb3, vC20, vC30 // vb3 = {c2c, c3c, c2d, c3d}
2091
vmrghw vC20, vC20, vC30 // vC20 = {c2a, c3a, c2b, c3b}
2093
vaddfp vC20, vC20, vb3 // vC20 = {c2ac, c3ac, c2bd, c3bd}
2095
vperm vb2, vC00, vC20,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
2102
vsldoi vC00, vC00, vC20,8 // vC00 = {c0bd, c1bd, c2ac, c3ac}
2103
vaddfp vC00, vC00, vb2 // vC00 = {c0acbd,c1acbd,c2acbd,c3acbd}
2105
vmrglw vb2, vC01, vC11 // vb2 = {c0c, c1c, c0d, c1d}
2109
vmrghw vC01, vC01, vC11 // vC01 = {c0a, c1a, c0b, c1b}
2110
vaddfp vC01, vC01, vb2 // vC01 = {c0ac, c1ac, c0bd, c1bd}
2111
vmrglw vb3, vC21, vC31 // vb3 = {c2c, c3c, c2d, c3d}
2112
vmrghw vC21, vC21, vC31 // vC21 = {c2a, c3a, c2b, c3b}
2113
vaddfp vC21, vC21, vb3 // vC21 = {c2ac, c3ac, c2bd, c3bd}
2114
vperm vb2, vC01, vC21,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
2115
vsldoi vC01, vC01, vC21,8 // vC01 = {c0bd, c1bd, c2ac, c3ac}
2116
vaddfp vC01, vC01, vb2 // vC01 = {c0acbd,c1acbd,c0acbd,c1acbd}
2121
vmrglw vb2, vC02, vC12 // vb2 = {c0c, c1c, c0d, c1d}
2122
vmrghw vC02, vC02, vC12 // vC02 = {c0a, c1a, c0b, c1b}
2123
vaddfp vC02, vC02, vb2 // vC02 = {c0ac, c1ac, c0bd, c1bd}
2124
vmrglw vb3, vC22, vC32 // vb3 = {c2c, c3c, c2d, c3d}
2125
vmrghw vC22, vC22, vC32 // vC22 = {c2a, c3a, c2b, c3b}
2126
vaddfp vC22, vC22, vb3 // vC22 = {c2ac, c3ac, c2bd, c3bd}
2127
vperm vb2, vC02, vC22,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
2128
vsldoi vC02, vC02, vC22,8 // vC02 = {c0bd, c1bd, c2ac, c3ac}
2129
vaddfp vC02, vC02, vb2 // vC02 = {c0acbd,c1acbd,c0acbd,c1acbd}
2131
vmrglw vb2, vC03, vC13 // vb2 = {c0c, c1c, c0d, c1d}
2132
vmrghw vC03, vC03, vC13 // vC03 = {c0a, c1a, c0b, c1b}
2133
vaddfp vC03, vC03, vb2 // vC03 = {c0ac, c1ac, c0bd, c1bd}
2134
vmrglw vb3, vC23, vC33 // vb3 = {c2c, c3c, c2d, c3d}
2135
vmrghw vC23, vC23, vC33 // vC23 = {c2a, c3a, c2b, c3b}
2136
vxor vC33, vC33, vC33
2137
vaddfp vC23, vC23, vb3 // vC23 = {c2ac, c3ac, c2bd, c3bd}
2138
vperm vb2, vC03, vC23,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
2139
vsldoi vC03, vC03, vC23,8 // vC03 = {c0bd, c1bd, c2ac, c3ac}
2143
vaddfp vC03, vC03, vb2 // vC03 = {c0acbd,c1acbd,c0acbd,c1acbd}
2145
* Store it back out, and add it to C if necessary
2163
fmadd rC0, rc0, rbeta, rC0
2164
fmadd rC1, rc1, rbeta, rC1
2165
fmadd rC2, rc2, rbeta, rC2
2166
fmadd rC3, rc3, rbeta, rC3
2190
fmadd rC0, rc0, rbeta, rC0
2191
fmadd rC1, rc1, rbeta, rC1
2192
fmadd rC2, rc2, rbeta, rC2
2193
fmadd rC3, rc3, rbeta, rC3
2217
fmadd rC0, rc0, rbeta, rC0
2218
fmadd rC1, rc1, rbeta, rC1
2219
fmadd rC2, rc2, rbeta, rC2
2220
fmadd rC3, rc3, rbeta, rC3
2250
fmadd rC0, rc0, rbeta, rC0
2251
fmadd rC1, rc1, rbeta, rC1
2252
fmadd rC2, rc2, rbeta, rC2
2253
fmadd rC3, rc3, rbeta, rC3
2261
* Mov ptrs, while(M)
2269
addi pC0, pC0, incCm
2270
addi pC1, pC1, incCm
2271
addi pC2, pC2, incCm
2272
addi pC3, pC3, incCm
2276
* Last iteration of M-loop unrolled so we can intermix M iterations
2295
vxor vC33, vC33, vC33
2317
vmaddfp vC00, vA0, vB0, vC33
2321
vmaddfp vC10, vA1, vB0, vC33
2325
vmaddfp vC20, vA2, vB0, vC33
2329
vmaddfp vC30, vA3, vB0, vC33
2333
vmaddfp vC01, vA0, vB1, vC33
2334
vmaddfp vC11, vA1, vB1, vC33
2336
vmaddfp vC21, vA2, vB1, vC33
2338
vmaddfp vC31, vA3, vB1, vC33
2342
vmaddfp vC02, vA0, vB2, vC33
2343
vmaddfp vC12, vA1, vB2, vC33
2344
vmaddfp vC22, vA2, vB2, vC33
2345
vmaddfp vC32, vA3, vB2, vC33
2349
vmaddfp vC03, vA0, vB3, vC33
2353
vmaddfp vC13, vA1, vB3, vC33
2357
vmaddfp vC23, vA2, vB3, vC33
2361
vmaddfp vC33, vA3, vB3, vC33
2368
#endif /* end K=0 block */
2373
vmaddfp vC00, va0, vb0, vC00
2377
vmaddfp vC10, va1, vb0, vC10
2381
vmaddfp vC20, va2, vb0, vC20
2385
vmaddfp vC30, va3, vb0, vC30
2389
vmaddfp vC01, va0, vb1, vC01
2390
vmaddfp vC11, va1, vb1, vC11
2391
vmaddfp vC21, va2, vb1, vC21
2392
vmaddfp vC31, va3, vb1, vC31
2396
vmaddfp vC02, va0, vb2, vC02
2397
vmaddfp vC12, va1, vb2, vC12
2398
vmaddfp vC22, va2, vb2, vC22
2399
vmaddfp vC32, va3, vb2, vC32
2403
vmaddfp vC03, va0, vb3, vC03
2407
vmaddfp vC13, va1, vb3, vC13
2411
vmaddfp vC23, va2, vb3, vC23
2415
vmaddfp vC33, va3, vb3, vC33
2422
#endif /* end K=4 block */
2427
vmaddfp vC00, vA0, vB0, vC00
2431
vmaddfp vC10, vA1, vB0, vC10
2435
vmaddfp vC20, vA2, vB0, vC20
2439
vmaddfp vC30, vA3, vB0, vC30
2443
vmaddfp vC01, vA0, vB1, vC01
2444
vmaddfp vC11, vA1, vB1, vC11
2445
vmaddfp vC21, vA2, vB1, vC21
2446
vmaddfp vC31, vA3, vB1, vC31
2450
vmaddfp vC02, vA0, vB2, vC02
2451
vmaddfp vC12, vA1, vB2, vC12
2452
vmaddfp vC22, vA2, vB2, vC22
2453
vmaddfp vC32, vA3, vB2, vC32
2457
vmaddfp vC03, vA0, vB3, vC03
2461
vmaddfp vC13, vA1, vB3, vC13
2465
vmaddfp vC23, vA2, vB3, vC23
2469
vmaddfp vC33, vA3, vB3, vC33
2476
#endif /* end K=8 block */
2481
vmaddfp vC00, va0, vb0, vC00
2485
vmaddfp vC10, va1, vb0, vC10
2489
vmaddfp vC20, va2, vb0, vC20
2493
vmaddfp vC30, va3, vb0, vC30
2497
vmaddfp vC01, va0, vb1, vC01
2498
vmaddfp vC11, va1, vb1, vC11
2499
vmaddfp vC21, va2, vb1, vC21
2500
vmaddfp vC31, va3, vb1, vC31
2504
vmaddfp vC02, va0, vb2, vC02
2505
vmaddfp vC12, va1, vb2, vC12
2506
vmaddfp vC22, va2, vb2, vC22
2507
vmaddfp vC32, va3, vb2, vC32
2511
vmaddfp vC03, va0, vb3, vC03
2515
vmaddfp vC13, va1, vb3, vC13
2519
vmaddfp vC23, va2, vb3, vC23
2523
vmaddfp vC33, va3, vb3, vC33
2530
#endif /* end K=12 block */
2535
vmaddfp vC00, vA0, vB0, vC00
2539
vmaddfp vC10, vA1, vB0, vC10
2543
vmaddfp vC20, vA2, vB0, vC20
2547
vmaddfp vC30, vA3, vB0, vC30
2551
vmaddfp vC01, vA0, vB1, vC01
2552
vmaddfp vC11, vA1, vB1, vC11
2553
vmaddfp vC21, vA2, vB1, vC21
2554
vmaddfp vC31, vA3, vB1, vC31
2558
vmaddfp vC02, vA0, vB2, vC02
2559
vmaddfp vC12, vA1, vB2, vC12
2560
vmaddfp vC22, vA2, vB2, vC22
2561
vmaddfp vC32, vA3, vB2, vC32
2565
vmaddfp vC03, vA0, vB3, vC03
2569
vmaddfp vC13, vA1, vB3, vC13
2573
vmaddfp vC23, vA2, vB3, vC23
2577
vmaddfp vC33, vA3, vB3, vC33
2584
#endif /* end K=16 block */
2589
vmaddfp vC00, va0, vb0, vC00
2593
vmaddfp vC10, va1, vb0, vC10
2597
vmaddfp vC20, va2, vb0, vC20
2601
vmaddfp vC30, va3, vb0, vC30
2605
vmaddfp vC01, va0, vb1, vC01
2606
vmaddfp vC11, va1, vb1, vC11
2607
vmaddfp vC21, va2, vb1, vC21
2608
vmaddfp vC31, va3, vb1, vC31
2612
vmaddfp vC02, va0, vb2, vC02
2613
vmaddfp vC12, va1, vb2, vC12
2614
vmaddfp vC22, va2, vb2, vC22
2615
vmaddfp vC32, va3, vb2, vC32
2619
vmaddfp vC03, va0, vb3, vC03
2623
vmaddfp vC13, va1, vb3, vC13
2627
vmaddfp vC23, va2, vb3, vC23
2631
vmaddfp vC33, va3, vb3, vC33
2638
#endif /* end K=20 block */
2643
vmaddfp vC00, vA0, vB0, vC00
2647
vmaddfp vC10, vA1, vB0, vC10
2651
vmaddfp vC20, vA2, vB0, vC20
2655
vmaddfp vC30, vA3, vB0, vC30
2659
vmaddfp vC01, vA0, vB1, vC01
2660
vmaddfp vC11, vA1, vB1, vC11
2661
vmaddfp vC21, vA2, vB1, vC21
2662
vmaddfp vC31, vA3, vB1, vC31
2666
vmaddfp vC02, vA0, vB2, vC02
2667
vmaddfp vC12, vA1, vB2, vC12
2668
vmaddfp vC22, vA2, vB2, vC22
2669
vmaddfp vC32, vA3, vB2, vC32
2673
vmaddfp vC03, vA0, vB3, vC03
2677
vmaddfp vC13, vA1, vB3, vC13
2681
vmaddfp vC23, vA2, vB3, vC23
2685
vmaddfp vC33, vA3, vB3, vC33
2692
#endif /* end K=24 block */
2697
vmaddfp vC00, va0, vb0, vC00
2701
vmaddfp vC10, va1, vb0, vC10
2705
vmaddfp vC20, va2, vb0, vC20
2709
vmaddfp vC30, va3, vb0, vC30
2713
vmaddfp vC01, va0, vb1, vC01
2714
vmaddfp vC11, va1, vb1, vC11
2715
vmaddfp vC21, va2, vb1, vC21
2716
vmaddfp vC31, va3, vb1, vC31
2720
vmaddfp vC02, va0, vb2, vC02
2721
vmaddfp vC12, va1, vb2, vC12
2723
vmaddfp vC22, va2, vb2, vC22
2725
vmaddfp vC32, va3, vb2, vC32
2729
vmaddfp vC03, va0, vb3, vC03
2733
vmaddfp vC13, va1, vb3, vC13
2737
vmaddfp vC23, va2, vb3, vC23
2741
vmaddfp vC33, va3, vb3, vC33
2748
#endif /* end K=28 block */
2753
vmaddfp vC00, vA0, vB0, vC00
2757
vmaddfp vC10, vA1, vB0, vC10
2761
vmaddfp vC20, vA2, vB0, vC20
2765
vmaddfp vC30, vA3, vB0, vC30
2769
vmaddfp vC01, vA0, vB1, vC01
2770
vmaddfp vC11, vA1, vB1, vC11
2771
vmaddfp vC21, vA2, vB1, vC21
2772
vmaddfp vC31, vA3, vB1, vC31
2776
vmaddfp vC02, vA0, vB2, vC02
2777
vmaddfp vC12, vA1, vB2, vC12
2778
vmaddfp vC22, vA2, vB2, vC22
2779
vmaddfp vC32, vA3, vB2, vC32
2783
vmaddfp vC03, vA0, vB3, vC03
2787
vmaddfp vC13, vA1, vB3, vC13
2791
vmaddfp vC23, vA2, vB3, vC23
2795
vmaddfp vC33, vA3, vB3, vC33
2802
#endif /* end K=32 block */
2807
vmaddfp vC00, va0, vb0, vC00
2811
vmaddfp vC10, va1, vb0, vC10
2815
vmaddfp vC20, va2, vb0, vC20
2819
vmaddfp vC30, va3, vb0, vC30
2823
vmaddfp vC01, va0, vb1, vC01
2824
vmaddfp vC11, va1, vb1, vC11
2825
vmaddfp vC21, va2, vb1, vC21
2826
vmaddfp vC31, va3, vb1, vC31
2830
vmaddfp vC02, va0, vb2, vC02
2831
vmaddfp vC12, va1, vb2, vC12
2832
vmaddfp vC22, va2, vb2, vC22
2833
vmaddfp vC32, va3, vb2, vC32
2837
vmaddfp vC03, va0, vb3, vC03
2841
vmaddfp vC13, va1, vb3, vC13
2845
vmaddfp vC23, va2, vb3, vC23
2849
vmaddfp vC33, va3, vb3, vC33
2856
#endif /* end K=36 block */
2861
vmaddfp vC00, vA0, vB0, vC00
2865
vmaddfp vC10, vA1, vB0, vC10
2869
vmaddfp vC20, vA2, vB0, vC20
2873
vmaddfp vC30, vA3, vB0, vC30
2877
vmaddfp vC01, vA0, vB1, vC01
2878
vmaddfp vC11, vA1, vB1, vC11
2879
vmaddfp vC21, vA2, vB1, vC21
2880
vmaddfp vC31, vA3, vB1, vC31
2884
vmaddfp vC02, vA0, vB2, vC02
2885
vmaddfp vC12, vA1, vB2, vC12
2886
vmaddfp vC22, vA2, vB2, vC22
2887
vmaddfp vC32, vA3, vB2, vC32
2891
vmaddfp vC03, vA0, vB3, vC03
2895
vmaddfp vC13, vA1, vB3, vC13
2899
vmaddfp vC23, vA2, vB3, vC23
2903
vmaddfp vC33, vA3, vB3, vC33
2910
#endif /* end K=40 block */
2915
vmaddfp vC00, va0, vb0, vC00
2919
vmaddfp vC10, va1, vb0, vC10
2923
vmaddfp vC20, va2, vb0, vC20
2927
vmaddfp vC30, va3, vb0, vC30
2931
vmaddfp vC01, va0, vb1, vC01
2932
vmaddfp vC11, va1, vb1, vC11
2933
vmaddfp vC21, va2, vb1, vC21
2934
vmaddfp vC31, va3, vb1, vC31
2938
vmaddfp vC02, va0, vb2, vC02
2939
vmaddfp vC12, va1, vb2, vC12
2940
vmaddfp vC22, va2, vb2, vC22
2941
vmaddfp vC32, va3, vb2, vC32
2945
vmaddfp vC03, va0, vb3, vC03
2949
vmaddfp vC13, va1, vb3, vC13
2953
vmaddfp vC23, va2, vb3, vC23
2957
vmaddfp vC33, va3, vb3, vC33
2964
#endif /* end K=44 block */
2969
vmaddfp vC00, vA0, vB0, vC00
2973
vmaddfp vC10, vA1, vB0, vC10
2977
vmaddfp vC20, vA2, vB0, vC20
2981
vmaddfp vC30, vA3, vB0, vC30
2985
vmaddfp vC01, vA0, vB1, vC01
2986
vmaddfp vC11, vA1, vB1, vC11
2987
vmaddfp vC21, vA2, vB1, vC21
2988
vmaddfp vC31, vA3, vB1, vC31
2992
vmaddfp vC02, vA0, vB2, vC02
2993
vmaddfp vC12, vA1, vB2, vC12
2994
vmaddfp vC22, vA2, vB2, vC22
2995
vmaddfp vC32, vA3, vB2, vC32
2999
vmaddfp vC03, vA0, vB3, vC03
3003
vmaddfp vC13, vA1, vB3, vC13
3007
vmaddfp vC23, vA2, vB3, vC23
3011
vmaddfp vC33, vA3, vB3, vC33
3018
#endif /* end K=48 block */
3023
vmaddfp vC00, va0, vb0, vC00
3027
vmaddfp vC10, va1, vb0, vC10
3031
vmaddfp vC20, va2, vb0, vC20
3035
vmaddfp vC30, va3, vb0, vC30
3039
vmaddfp vC01, va0, vb1, vC01
3040
vmaddfp vC11, va1, vb1, vC11
3041
vmaddfp vC21, va2, vb1, vC21
3042
vmaddfp vC31, va3, vb1, vC31
3046
vmaddfp vC02, va0, vb2, vC02
3047
vmaddfp vC12, va1, vb2, vC12
3048
vmaddfp vC22, va2, vb2, vC22
3049
vmaddfp vC32, va3, vb2, vC32
3053
vmaddfp vC03, va0, vb3, vC03
3057
vmaddfp vC13, va1, vb3, vC13
3061
vmaddfp vC23, va2, vb3, vC23
3065
vmaddfp vC33, va3, vb3, vC33
3072
#endif /* end K=52 block */
3077
vmaddfp vC00, vA0, vB0, vC00
3081
vmaddfp vC10, vA1, vB0, vC10
3085
vmaddfp vC20, vA2, vB0, vC20
3089
vmaddfp vC30, vA3, vB0, vC30
3093
vmaddfp vC01, vA0, vB1, vC01
3094
vmaddfp vC11, vA1, vB1, vC11
3095
vmaddfp vC21, vA2, vB1, vC21
3096
vmaddfp vC31, vA3, vB1, vC31
3100
vmaddfp vC02, vA0, vB2, vC02
3101
vmaddfp vC12, vA1, vB2, vC12
3102
vmaddfp vC22, vA2, vB2, vC22
3103
vmaddfp vC32, vA3, vB2, vC32
3107
vmaddfp vC03, vA0, vB3, vC03
3111
vmaddfp vC13, vA1, vB3, vC13
3115
vmaddfp vC23, vA2, vB3, vC23
3119
vmaddfp vC33, vA3, vB3, vC33
3126
#endif /* end K=56 block */
3131
vmaddfp vC00, va0, vb0, vC00
3135
vmaddfp vC10, va1, vb0, vC10
3139
vmaddfp vC20, va2, vb0, vC20
3143
vmaddfp vC30, va3, vb0, vC30
3147
vmaddfp vC01, va0, vb1, vC01
3148
vmaddfp vC11, va1, vb1, vC11
3149
vmaddfp vC21, va2, vb1, vC21
3150
vmaddfp vC31, va3, vb1, vC31
3154
vmaddfp vC02, va0, vb2, vC02
3155
vmaddfp vC12, va1, vb2, vC12
3156
vmaddfp vC22, va2, vb2, vC22
3157
vmaddfp vC32, va3, vb2, vC32
3161
vmaddfp vC03, va0, vb3, vC03
3165
vmaddfp vC13, va1, vb3, vC13
3169
vmaddfp vC23, va2, vb3, vC23
3173
vmaddfp vC33, va3, vb3, vC33
3180
#endif /* end K=60 block */
3185
vmaddfp vC00, vA0, vB0, vC00
3189
vmaddfp vC10, vA1, vB0, vC10
3193
vmaddfp vC20, vA2, vB0, vC20
3197
vmaddfp vC30, vA3, vB0, vC30
3201
vmaddfp vC01, vA0, vB1, vC01
3202
vmaddfp vC11, vA1, vB1, vC11
3203
vmaddfp vC21, vA2, vB1, vC21
3204
vmaddfp vC31, vA3, vB1, vC31
3208
vmaddfp vC02, vA0, vB2, vC02
3209
vmaddfp vC12, vA1, vB2, vC12
3210
vmaddfp vC22, vA2, vB2, vC22
3211
vmaddfp vC32, vA3, vB2, vC32
3215
vmaddfp vC03, vA0, vB3, vC03
3219
vmaddfp vC13, vA1, vB3, vC13
3223
vmaddfp vC23, vA2, vB3, vC23
3227
vmaddfp vC33, vA3, vB3, vC33
3234
#endif /* end K=64 block */
3239
vmaddfp vC00, va0, vb0, vC00
3243
vmaddfp vC10, va1, vb0, vC10
3247
vmaddfp vC20, va2, vb0, vC20
3251
vmaddfp vC30, va3, vb0, vC30
3255
vmaddfp vC01, va0, vb1, vC01
3256
vmaddfp vC11, va1, vb1, vC11
3257
vmaddfp vC21, va2, vb1, vC21
3258
vmaddfp vC31, va3, vb1, vC31
3262
vmaddfp vC02, va0, vb2, vC02
3263
vmaddfp vC12, va1, vb2, vC12
3264
vmaddfp vC22, va2, vb2, vC22
3265
vmaddfp vC32, va3, vb2, vC32
3269
vmaddfp vC03, va0, vb3, vC03
3273
vmaddfp vC13, va1, vb3, vC13
3277
vmaddfp vC23, va2, vb3, vC23
3281
vmaddfp vC33, va3, vb3, vC33
3288
#endif /* end K=68 block */
3293
vmaddfp vC00, vA0, vB0, vC00
3297
vmaddfp vC10, vA1, vB0, vC10
3301
vmaddfp vC20, vA2, vB0, vC20
3305
vmaddfp vC30, vA3, vB0, vC30
3309
vmaddfp vC01, vA0, vB1, vC01
3310
vmaddfp vC11, vA1, vB1, vC11
3311
vmaddfp vC21, vA2, vB1, vC21
3312
vmaddfp vC31, vA3, vB1, vC31
3316
vmaddfp vC02, vA0, vB2, vC02
3317
vmaddfp vC12, vA1, vB2, vC12
3318
vmaddfp vC22, vA2, vB2, vC22
3319
vmaddfp vC32, vA3, vB2, vC32
3323
vmaddfp vC03, vA0, vB3, vC03
3327
vmaddfp vC13, vA1, vB3, vC13
3331
vmaddfp vC23, vA2, vB3, vC23
3335
vmaddfp vC33, vA3, vB3, vC33
3342
#endif /* end K=72 block */
3347
vmaddfp vC00, va0, vb0, vC00
3351
vmaddfp vC10, va1, vb0, vC10
3355
vmaddfp vC20, va2, vb0, vC20
3359
vmaddfp vC30, va3, vb0, vC30
3363
vmaddfp vC01, va0, vb1, vC01
3364
vmaddfp vC11, va1, vb1, vC11
3365
vmaddfp vC21, va2, vb1, vC21
3366
vmaddfp vC31, va3, vb1, vC31
3370
vmaddfp vC02, va0, vb2, vC02
3371
vmaddfp vC12, va1, vb2, vC12
3372
vmaddfp vC22, va2, vb2, vC22
3373
vmaddfp vC32, va3, vb2, vC32
3377
vmaddfp vC03, va0, vb3, vC03
3381
vmaddfp vC13, va1, vb3, vC13
3385
vmaddfp vC23, va2, vb3, vC23
3389
vmaddfp vC33, va3, vb3, vC33
3396
#endif /* end K=76 block */
3401
vmaddfp vC00, vA0, vB0, vC00
3405
vmaddfp vC10, vA1, vB0, vC10
3409
vmaddfp vC20, vA2, vB0, vC20
3413
vmaddfp vC30, vA3, vB0, vC30
3417
vmaddfp vC01, vA0, vB1, vC01
3418
vmaddfp vC11, vA1, vB1, vC11
3419
vmaddfp vC21, vA2, vB1, vC21
3420
vmaddfp vC31, vA3, vB1, vC31
3424
vmaddfp vC02, vA0, vB2, vC02
3425
vmaddfp vC12, vA1, vB2, vC12
3426
vmaddfp vC22, vA2, vB2, vC22
3427
vmaddfp vC32, vA3, vB2, vC32
3431
vmaddfp vC03, vA0, vB3, vC03
3435
vmaddfp vC13, vA1, vB3, vC13
3439
vmaddfp vC23, vA2, vB3, vC23
3443
vmaddfp vC33, vA3, vB3, vC33
3450
#endif /* end K=80 block */
3455
vmaddfp vC00, va0, vb0, vC00
3459
vmaddfp vC10, va1, vb0, vC10
3463
vmaddfp vC20, va2, vb0, vC20
3467
vmaddfp vC30, va3, vb0, vC30
3471
vmaddfp vC01, va0, vb1, vC01
3472
vmaddfp vC11, va1, vb1, vC11
3473
vmaddfp vC21, va2, vb1, vC21
3474
vmaddfp vC31, va3, vb1, vC31
3478
vmaddfp vC02, va0, vb2, vC02
3479
vmaddfp vC12, va1, vb2, vC12
3480
vmaddfp vC22, va2, vb2, vC22
3481
vmaddfp vC32, va3, vb2, vC32
3485
vmaddfp vC03, va0, vb3, vC03
3489
vmaddfp vC13, va1, vb3, vC13
3493
vmaddfp vC23, va2, vb3, vC23
3497
vmaddfp vC33, va3, vb3, vC33
3504
#endif /* end K=84 block */
3505
#if KB > 88 /* K=88 block -- UNFINISHED (author's "HERE HERE" marker); presumably unreachable since KB < 92 is enforced and KB appears to follow the 4-way unroll -- TODO confirm */
3509
vmaddfp vC00, vA0, vB0, vC00
/* NOTE(review): the vC10 += vA1*vB0 FMA present in every other K-block is
 * missing here (residual line numbering shows vC20 where vC10 belongs) */
3513
vmaddfp vC20, vA2, vB0, vC20
3517
vmaddfp vC30, vA3, vB0, vC30
3521
vmaddfp vC01, vA0, vB1, vC01
3522
vmaddfp vC11, vA1, vB1, vC11
3523
vmaddfp vC21, vA2, vB1, vC21
3524
vmaddfp vC31, vA3, vB1, vC31
3528
vmaddfp vC02, vA0, vB2, vC02
/* NOTE(review): the vC12 += vA1*vB2 FMA is also missing from this block */
3529
vmaddfp vC22, vA2, vB2, vC22
3530
vmaddfp vC32, vA3, vB2, vC32
3534
vmaddfp vC03, vA0, vB3, vC03
3538
vmaddfp vC13, vA1, vB3, vC13
3542
vmaddfp vC23, vA2, vB3, vC23
3546
vmaddfp vC33, vA3, vB3, vC33
3553
#endif /* end K=88 block */
3558
vmaddfp vC00, va0, vb0, vC00
3562
vmaddfp vC10, va1, vb0, vC10
3566
vmaddfp vC20, va2, vb0, vC20
3570
vmaddfp vC30, va3, vb0, vC30
3574
vmaddfp vC01, va0, vb1, vC01
3575
vmaddfp vC11, va1, vb1, vC11
3576
vmaddfp vC21, va2, vb1, vC21
3577
vmaddfp vC31, va3, vb1, vC31
3581
vmaddfp vC02, va0, vb2, vC02
3582
vmaddfp vC12, va1, vb2, vC12
3583
vmaddfp vC22, va2, vb2, vC22
3584
vmaddfp vC32, va3, vb2, vC32
3588
vmaddfp vC03, va0, vb3, vC03
3592
vmaddfp vC13, va1, vb3, vC13
3596
vmaddfp vC23, va2, vb3, vC23
3600
vmaddfp vC33, va3, vb3, vC33
3607
#endif /* end K=92 block */
3612
vmaddfp vC00, vA0, vB0, vC00
3616
vmaddfp vC10, vA1, vB0, vC10
3620
vmaddfp vC20, vA2, vB0, vC20
3624
vmaddfp vC30, vA3, vB0, vC30
3628
vmaddfp vC01, vA0, vB1, vC01
3629
vmaddfp vC11, vA1, vB1, vC11
3630
vmaddfp vC21, vA2, vB1, vC21
3631
vmaddfp vC31, vA3, vB1, vC31
3635
vmaddfp vC02, vA0, vB2, vC02
3636
vmaddfp vC12, vA1, vB2, vC12
3637
vmaddfp vC22, vA2, vB2, vC22
3638
vmaddfp vC32, vA3, vB2, vC32
3642
vmaddfp vC03, vA0, vB3, vC03
3646
vmaddfp vC13, vA1, vB3, vC13
3650
vmaddfp vC23, vA2, vB3, vC23
3654
vmaddfp vC33, vA3, vB3, vC33
3661
#endif /* end K=96 block */
3666
vmaddfp vC00, va0, vb0, vC00
3670
vmaddfp vC10, va1, vb0, vC10
3674
vmaddfp vC20, va2, vb0, vC20
3678
vmaddfp vC30, va3, vb0, vC30
3682
vmaddfp vC01, va0, vb1, vC01
3683
vmaddfp vC11, va1, vb1, vC11
3684
vmaddfp vC21, va2, vb1, vC21
3685
vmaddfp vC31, va3, vb1, vC31
3689
vmaddfp vC02, va0, vb2, vC02
3690
vmaddfp vC12, va1, vb2, vC12
3691
vmaddfp vC22, va2, vb2, vC22
3692
vmaddfp vC32, va3, vb2, vC32
3696
vmaddfp vC03, va0, vb3, vC03
3700
vmaddfp vC13, va1, vb3, vC13
3704
vmaddfp vC23, va2, vb3, vC23
3708
vmaddfp vC33, va3, vb3, vC33
3715
#endif /* end K=100 block */
3720
vmaddfp vC00, vA0, vB0, vC00
3724
vmaddfp vC10, vA1, vB0, vC10
3728
vmaddfp vC20, vA2, vB0, vC20
3732
vmaddfp vC30, vA3, vB0, vC30
3736
vmaddfp vC01, vA0, vB1, vC01
3737
vmaddfp vC11, vA1, vB1, vC11
3738
vmaddfp vC21, vA2, vB1, vC21
3739
vmaddfp vC31, vA3, vB1, vC31
3743
vmaddfp vC02, vA0, vB2, vC02
3744
vmaddfp vC12, vA1, vB2, vC12
3745
vmaddfp vC22, vA2, vB2, vC22
3746
vmaddfp vC32, vA3, vB2, vC32
3750
vmaddfp vC03, vA0, vB3, vC03
3754
vmaddfp vC13, vA1, vB3, vC13
3758
vmaddfp vC23, vA2, vB3, vC23
3762
vmaddfp vC33, vA3, vB3, vC33
3769
#endif /* end K=104 block */
3774
vmaddfp vC00, va0, vb0, vC00
3778
vmaddfp vC10, va1, vb0, vC10
3782
vmaddfp vC20, va2, vb0, vC20
3786
vmaddfp vC30, va3, vb0, vC30
3790
vmaddfp vC01, va0, vb1, vC01
3791
vmaddfp vC11, va1, vb1, vC11
3792
vmaddfp vC21, va2, vb1, vC21
3793
vmaddfp vC31, va3, vb1, vC31
3797
vmaddfp vC02, va0, vb2, vC02
3798
vmaddfp vC12, va1, vb2, vC12
3799
vmaddfp vC22, va2, vb2, vC22
3800
vmaddfp vC32, va3, vb2, vC32
3804
vmaddfp vC03, va0, vb3, vC03
3808
vmaddfp vC13, va1, vb3, vC13
3812
vmaddfp vC23, va2, vb3, vC23
3816
vmaddfp vC33, va3, vb3, vC33
3823
#endif /* end K=108 block */
3828
vmaddfp vC00, vA0, vB0, vC00
3832
vmaddfp vC10, vA1, vB0, vC10
3836
vmaddfp vC20, vA2, vB0, vC20
3840
vmaddfp vC30, vA3, vB0, vC30
3844
vmaddfp vC01, vA0, vB1, vC01
3845
vmaddfp vC11, vA1, vB1, vC11
3846
vmaddfp vC21, vA2, vB1, vC21
3847
vmaddfp vC31, vA3, vB1, vC31
3851
vmaddfp vC02, vA0, vB2, vC02
3852
vmaddfp vC12, vA1, vB2, vC12
3853
vmaddfp vC22, vA2, vB2, vC22
3854
vmaddfp vC32, vA3, vB2, vC32
3858
vmaddfp vC03, vA0, vB3, vC03
3862
vmaddfp vC13, vA1, vB3, vC13
3866
vmaddfp vC23, vA2, vB3, vC23
3870
vmaddfp vC33, vA3, vB3, vC33
3877
#endif /* end K=112 block */
3882
vmaddfp vC00, va0, vb0, vC00
3886
vmaddfp vC10, va1, vb0, vC10
3890
vmaddfp vC20, va2, vb0, vC20
3894
vmaddfp vC30, va3, vb0, vC30
3898
vmaddfp vC01, va0, vb1, vC01
3899
vmaddfp vC11, va1, vb1, vC11
3900
vmaddfp vC21, va2, vb1, vC21
3901
vmaddfp vC31, va3, vb1, vC31
3905
vmaddfp vC02, va0, vb2, vC02
3906
vmaddfp vC12, va1, vb2, vC12
3907
vmaddfp vC22, va2, vb2, vC22
3908
vmaddfp vC32, va3, vb2, vC32
3912
vmaddfp vC03, va0, vb3, vC03
3916
vmaddfp vC13, va1, vb3, vC13
3920
vmaddfp vC23, va2, vb3, vC23
3924
vmaddfp vC33, va3, vb3, vC33
3931
#endif /* end K=116 block */
3936
vmaddfp vC00, vA0, vB0, vC00
3940
vmaddfp vC10, vA1, vB0, vC10
3944
vmaddfp vC20, vA2, vB0, vC20
3948
vmaddfp vC30, vA3, vB0, vC30
3952
vmaddfp vC01, vA0, vB1, vC01
3953
vmaddfp vC11, vA1, vB1, vC11
3954
vmaddfp vC21, vA2, vB1, vC21
3955
vmaddfp vC31, vA3, vB1, vC31
3959
vmaddfp vC02, vA0, vB2, vC02
3960
vmaddfp vC12, vA1, vB2, vC12
3961
vmaddfp vC22, vA2, vB2, vC22
3962
vmaddfp vC32, vA3, vB2, vC32
3966
vmaddfp vC03, vA0, vB3, vC03
3970
vmaddfp vC13, vA1, vB3, vC13
3974
vmaddfp vC23, vA2, vB3, vC23
3978
vmaddfp vC33, vA3, vB3, vC33
3985
#endif /* end K=120 block */
3990
vmaddfp vC00, va0, vb0, vC00
3994
vmaddfp vC10, va1, vb0, vC10
3998
vmaddfp vC20, va2, vb0, vC20
4002
vmaddfp vC30, va3, vb0, vC30
4006
vmaddfp vC01, va0, vb1, vC01
4007
vmaddfp vC11, va1, vb1, vC11
4008
vmaddfp vC21, va2, vb1, vC21
4009
vmaddfp vC31, va3, vb1, vC31
4013
vmaddfp vC02, va0, vb2, vC02
4014
vmaddfp vC12, va1, vb2, vC12
4015
vmaddfp vC22, va2, vb2, vC22
4016
vmaddfp vC32, va3, vb2, vC32
4020
vmaddfp vC03, va0, vb3, vC03
4024
vmaddfp vC13, va1, vb3, vC13
4028
vmaddfp vC23, va2, vb3, vC23
4032
vmaddfp vC33, va3, vb3, vC33
4039
#endif /* end K=124 block */
4042
* Reduce vectors to scalars, stored in vC0x
4044
vspltisb vb0, 8 /* vb0={8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8} (splat to all 16 bytes) */
4045
vxor vb1, vb1, vb1 /* vb1={0...0} */
4046
vsldoi vb0, vb1, vb0, 8 /* vb0={0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8} */
4047
lvsl vb1, 0, r1 /*vb1={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}*/
4048
vaddubm vb1, vb0, vb1 /*vb1={0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23}*/
4049
vaddubm vb1, vb0, vb1 /*vb1={0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31}*/
4050
/* rC00 = {c0a, c0b, c0c, c0d} */
4051
/* rC10 = {c1a, c1b, c1c, c1d} */
4052
/* rC20 = {c2a, c2b, c2c, c2d} */
4053
/* rC30 = {c3a, c3b, c3c, c3d} */
4054
vmrglw vb2, vC00, vC10 // vb2 = {c0c, c1c, c0d, c1d}
4055
vmrghw vC00, vC00, vC10 // vC00 = {c0a, c1a, c0b, c1b}
4056
vaddfp vC00, vC00, vb2 // vC00 = {c0ac, c1ac, c0bd, c1bd}
4057
vmrglw vb3, vC20, vC30 // vb3 = {c2c, c3c, c2d, c3d}
4058
vmrghw vC20, vC20, vC30 // vC20 = {c2a, c3a, c2b, c3b}
4059
vaddfp vC20, vC20, vb3 // vC20 = {c2ac, c3ac, c2bd, c3bd}
4060
vperm vb2, vC00, vC20,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
4061
vsldoi vC00, vC00, vC20,8 // vC00 = {c0bd, c1bd, c2ac, c3ac}
4062
vaddfp vC00, vC00, vb2 // vC00 = {c0acbd,c1acbd,c2acbd,c3acbd}
4064
vmrglw vb2, vC01, vC11 // vb2 = {c0c, c1c, c0d, c1d}
4065
vmrghw vC01, vC01, vC11 // vC01 = {c0a, c1a, c0b, c1b}
4066
vaddfp vC01, vC01, vb2 // vC01 = {c0ac, c1ac, c0bd, c1bd}
4067
vmrglw vb3, vC21, vC31 // vb3 = {c2c, c3c, c2d, c3d}
4068
vmrghw vC21, vC21, vC31 // vC21 = {c2a, c3a, c2b, c3b}
4069
vaddfp vC21, vC21, vb3 // vC21 = {c2ac, c3ac, c2bd, c3bd}
4070
vperm vb2, vC01, vC21,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
4071
vsldoi vC01, vC01, vC21,8 // vC01 = {c0bd, c1bd, c2ac, c3ac}
4072
vaddfp vC01, vC01, vb2 // vC01 = {c0acbd,c1acbd,c2acbd,c3acbd}
4074
vmrglw vb2, vC02, vC12 // vb2 = {c0c, c1c, c0d, c1d}
4075
vmrghw vC02, vC02, vC12 // vC02 = {c0a, c1a, c0b, c1b}
4076
vaddfp vC02, vC02, vb2 // vC02 = {c0ac, c1ac, c0bd, c1bd}
4077
vmrglw vb3, vC22, vC32 // vb3 = {c2c, c3c, c2d, c3d}
4078
vmrghw vC22, vC22, vC32 // vC22 = {c2a, c3a, c2b, c3b}
4079
vaddfp vC22, vC22, vb3 // vC22 = {c2ac, c3ac, c2bd, c3bd}
4080
vperm vb2, vC02, vC22,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
4081
vsldoi vC02, vC02, vC22,8 // vC02 = {c0bd, c1bd, c2ac, c3ac}
4082
vaddfp vC02, vC02, vb2 // vC02 = {c0acbd,c1acbd,c2acbd,c3acbd}
4084
vmrglw vb2, vC03, vC13 // vb2 = {c0c, c1c, c0d, c1d}
4085
vmrghw vC03, vC03, vC13 // vC03 = {c0a, c1a, c0b, c1b}
4086
vaddfp vC03, vC03, vb2 // vC03 = {c0ac, c1ac, c0bd, c1bd}
4087
vmrglw vb3, vC23, vC33 // vb3 = {c2c, c3c, c2d, c3d}
4088
vmrghw vC23, vC23, vC33 // vC23 = {c2a, c3a, c2b, c3b}
4089
vaddfp vC23, vC23, vb3 // vC23 = {c2ac, c3ac, c2bd, c3bd}
4090
vperm vb2, vC03, vC23,vb1 // vb2 = {c0ac, c1ac, c2bd, c3bd}
4091
vsldoi vC03, vC03, vC23,8 // vC03 = {c0bd, c1bd, c2ac, c3ac}
4092
vaddfp vC03, vC03, vb2 // vC03 = {c0acbd,c1acbd,c2acbd,c3acbd}
4094
* Store it back out, and add it to C if necessary
4112
fmadd rC0, rc0, rbeta, rC0
4113
fmadd rC1, rc1, rbeta, rC1
4114
fmadd rC2, rc2, rbeta, rC2
4115
fmadd rC3, rc3, rbeta, rC3
4139
fmadd rC0, rc0, rbeta, rC0
4140
fmadd rC1, rc1, rbeta, rC1
4141
fmadd rC2, rc2, rbeta, rC2
4142
fmadd rC3, rc3, rbeta, rC3
4166
fmadd rC0, rc0, rbeta, rC0
4167
fmadd rC1, rc1, rbeta, rC1
4168
fmadd rC2, rc2, rbeta, rC2
4169
fmadd rC3, rc3, rbeta, rC3
4193
fmadd rC0, rc0, rbeta, rC0
4194
fmadd rC1, rc1, rbeta, rC1
4195
fmadd rC2, rc2, rbeta, rC2
4196
fmadd rC3, rc3, rbeta, rC3
4204
* Move ptrs, while(M)
4211
* Move ptrs, while(N)
4213
sub pA0, pA0, incAn /* pA0 -= M*KB */
4214
addi pB0, pB0, KB*4*4 /* pB0 += KB*4 */