3
#error "This kernel requires AMD's 4-operand FMAC (FM4)"
6
#define CMUL(arg_) 2*arg_
8
#define CMUL(arg_) arg_
11
* Integer register usage shown by these defines
47
* Save some instruction space by using the short versions of instructions
55
%rdi/4 %rsi/8 %rdx/12 %xmm0/16
56
void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
57
%rcx/24 %r8/28 %r9/32 8/36
58
const TYPE *A, const int lda, const TYPE *B, const int ldb,
60
const TYPE beta, TYPE *C, const int ldc)
64
.global ATL_asmdecor(ATL_USERMM)
66
ATL_asmdecor(ATL_USERMM):
68
* Save callee-saved iregs
79
* Setup input parameters
85
movaps %xmm1, BETAOFF(%rsp)
96
movslq 24(%rsp), incCn
98
prefetcht0 (pB0,lda,2)
100
* incCn = (4*ldc-M)*sizeof
112
* pA0 += 128; pB0 += 128
118
* ldb = ldb*sizeof; ldb3 = ldb*3; ldc3 = ldc*3
122
lea (ldb,ldb,2), ldb3
123
lea (ldc,ldc,2), ldc3
130
* pfA = A + lda*M ; incAn = lda*M
136
lea -128(pA0, pfA), pfA
138
lea -128(pB0,ldb,2), pfB
142
* K=0-3, with rCxx starting at zero
144
movaps -128(pB0), rB0
146
movaps -128(pA0), rA0
147
vfmaddps rC13, rA0, rB0, rC00
148
movaps -128(pB0,ldb), rB1
149
vfmaddps rC13, rA0, rB1, rC01
150
movaps -128(pB0,ldb,2), rB2
151
vfmaddps rC13, rA0, rB2, rC02
152
movaps -128(pB0,ldb3), rB3
153
vfmaddps rC13, rA0, rB3, rC03
154
movaps -128(pA0,lda), rA1
155
vfmaddps rC13, rA1, rB0, rC10
157
movaps -112(pA0), ra0
159
vfmaddps rC13, rA1, rB1, rC11
161
movaps -112(pA0,lda), ra1
163
vfmaddps rC13, rA1, rB2, rC12
165
movaps -112(pB0), rB0
167
vfmaddps rC13, rA1, rB3, rC13
169
movaps -112(pB0,ldb), rB1
175
vfmaddps rC00, ra0, rB0, rC00
176
movaps -112(pB0,ldb,2), rB2
177
vfmaddps rC10, ra1, rB0, rC10
178
movaps -112(pB0,ldb3), rB3
179
vfmaddps rC01, ra0, rB1, rC01
183
vfmaddps rC11, ra1, rB1, rC11
185
movaps -96(pA0,lda), rA1
187
vfmaddps rC02, ra0, rB2, rC02
191
vfmaddps rC12, ra1, rB2, rC12
193
movaps -96(pB0,ldb), rB1
195
vfmaddps rC03, ra0, rB3, rC03
197
movaps -96(pB0,ldb,2), rB2
199
vfmaddps rC13, ra1, rB3, rC13
201
movaps -96(pB0,ldb3), rB3
207
vfmaddps rC00, rA0, rB0, rC00
211
vfmaddps rC10, rA1, rB0, rC10
213
movaps -80(pA0,lda), ra1
215
vfmaddps rC01, rA0, rB1, rC01
218
#elif KB == 12 && defined(BETA1)
220
vfmaddps rC11, rA1, rB1, rC11
222
movaps -80(pB0,ldb), rB1
223
#elif KB == 12 && defined(BETA1)
225
vfmaddps rC02, rA0, rB2, rC02
230
vfmaddps rC12, rA1, rB2, rC12
232
movaps -80(pB0,ldb,2), rB2
233
#elif KB == 12 && defined(BETA1)
235
vfmaddps rC03, rA0, rB3, rC03
236
#if KB == 12 && defined(BETA1)
238
vfmaddps rC13, rA1, rB3, rC13
240
movaps -80(pB0,ldb3), rB3
241
#elif KB == 12 && defined(BETA0)
245
vfmaddps rC00, ra0, rB0, rC00
249
vfmaddps rC10, ra1, rB0, rC10
251
movaps -64(pA0,lda), rA1
253
vfmaddps rC01, ra0, rB1, rC01
256
#elif KB == 16 && defined(BETA1)
258
vfmaddps rC11, ra1, rB1, rC11
260
movaps -64(pB0,ldb), rB1
261
#elif KB == 16 && defined(BETA1)
263
vfmaddps rC02, ra0, rB2, rC02
268
vfmaddps rC12, ra1, rB2, rC12
270
movaps -64(pB0,ldb,2), rB2
271
#elif KB == 16 && defined(BETA1)
273
vfmaddps rC03, ra0, rB3, rC03
274
#if KB == 16 && defined(BETA1)
276
vfmaddps rC13, ra1, rB3, rC13
278
movaps -64(pB0,ldb3), rB3
279
#elif KB == 16 && defined(BETA1)
283
vfmaddps rC00, rA0, rB0, rC00
287
vfmaddps rC10, rA1, rB0, rC10
289
movaps -48(pA0,lda), ra1
291
vfmaddps rC01, rA0, rB1, rC01
294
#elif KB == 20 && defined(BETA1)
296
vfmaddps rC11, rA1, rB1, rC11
298
movaps -48(pB0,ldb), rB1
299
#elif KB == 20 && defined(BETA1)
301
vfmaddps rC02, rA0, rB2, rC02
302
#if KB == 20 && defined(BETA0)
304
vfmaddps rC12, rA1, rB2, rC12
306
movaps -48(pB0,ldb,2), rB2
307
#elif KB == 20 && defined(BETA1)
309
vfmaddps rC03, rA0, rB3, rC03
310
#if KB == 20 && defined(BETA1)
312
vfmaddps rC13, rA1, rB3, rC13
314
movaps -48(pB0,ldb3), rB3
315
#elif KB == 20 && defined(BETA0)
319
vfmaddps rC00, ra0, rB0, rC00
323
vfmaddps rC10, ra1, rB0, rC10
325
movaps -32(pA0,lda), rA1
327
vfmaddps rC01, ra0, rB1, rC01
330
#elif KB == 24 && defined(BETA1)
332
vfmaddps rC11, ra1, rB1, rC11
334
movaps -32(pB0,ldb), rB1
335
#elif KB == 24 && defined(BETA1)
337
vfmaddps rC02, ra0, rB2, rC02
338
#if KB == 24 && defined(BETA1)
340
vfmaddps rC12, ra1, rB2, rC12
342
movaps -32(pB0,ldb,2), rB2
343
#elif KB == 24 && defined(BETA1)
345
vfmaddps rC03, ra0, rB3, rC03
346
#if KB == 24 && defined(BETA1)
348
vfmaddps rC13, ra1, rB3, rC13
350
movaps -32(pB0,ldb3), rB3
351
#elif KB == 24 && defined(BETA1)
355
vfmaddps rC00, rA0, rB0, rC00
359
vfmaddps rC10, rA1, rB0, rC10
361
movaps -16(pA0,lda), ra1
363
vfmaddps rC01, rA0, rB1, rC01
366
#elif KB == 28 && defined(BETA1)
368
vfmaddps rC11, rA1, rB1, rC11
370
movaps -16(pB0,ldb), rB1
371
#elif KB == 28 && defined(BETA1)
373
vfmaddps rC02, rA0, rB2, rC02
374
#if KB == 28 && defined(BETA0)
376
vfmaddps rC12, rA1, rB2, rC12
378
movaps -16(pB0,ldb,2), rB2
379
#elif KB == 28 && defined(BETA1)
381
vfmaddps rC03, rA0, rB3, rC03
382
#if KB == 28 && defined(BETA1)
384
vfmaddps rC13, rA1, rB3, rC13
386
movaps -16(pB0,ldb3), rB3
387
#elif KB == 28 && defined(BETA0)
391
vfmaddps rC00, ra0, rB0, rC00
395
vfmaddps rC10, ra1, rB0, rC10
397
movaps 0(pA0,lda), rA1
399
vfmaddps rC01, ra0, rB1, rC01
402
#elif KB == 32 && defined(BETA1)
404
vfmaddps rC11, ra1, rB1, rC11
406
movaps 0(pB0,ldb), rB1
407
#elif KB == 32 && defined(BETA1)
409
vfmaddps rC02, ra0, rB2, rC02
410
#if KB == 32 && defined(BETA1)
412
vfmaddps rC12, ra1, rB2, rC12
414
movaps 0(pB0,ldb,2), rB2
415
#elif KB == 32 && defined(BETA1)
417
vfmaddps rC03, ra0, rB3, rC03
418
#if KB == 32 && defined(BETA1)
420
vfmaddps rC13, ra1, rB3, rC13
422
movaps 0(pB0,ldb3), rB3
423
#elif KB == 32 && defined(BETA1)
427
vfmaddps rC00, rA0, rB0, rC00
431
vfmaddps rC10, rA1, rB0, rC10
433
movaps 16(pA0,lda), ra1
435
vfmaddps rC01, rA0, rB1, rC01
438
#elif KB == 36 && defined(BETA1)
440
vfmaddps rC11, rA1, rB1, rC11
442
movaps 16(pB0,ldb), rB1
443
#elif KB == 36 && defined(BETA1)
445
vfmaddps rC02, rA0, rB2, rC02
446
#if KB == 36 && defined(BETA0)
448
vfmaddps rC12, rA1, rB2, rC12
450
movaps 16(pB0,ldb,2), rB2
451
#elif KB == 36 && defined(BETA1)
453
vfmaddps rC03, rA0, rB3, rC03
454
#if KB == 36 && defined(BETA1)
456
vfmaddps rC13, rA1, rB3, rC13
458
movaps 16(pB0,ldb3), rB3
459
#elif KB == 36 && defined(BETA0)
463
vfmaddps rC00, ra0, rB0, rC00
467
vfmaddps rC10, ra1, rB0, rC10
469
movaps 32(pA0,lda), rA1
471
vfmaddps rC01, ra0, rB1, rC01
474
#elif KB == 40 && defined(BETA1)
476
vfmaddps rC11, ra1, rB1, rC11
478
movaps 32(pB0,ldb), rB1
479
#elif KB == 40 && defined(BETA1)
481
vfmaddps rC02, ra0, rB2, rC02
482
#if KB == 40 && defined(BETA1)
484
vfmaddps rC12, ra1, rB2, rC12
486
movaps 32(pB0,ldb,2), rB2
487
#elif KB == 40 && defined(BETA1)
489
vfmaddps rC03, ra0, rB3, rC03
490
#if KB == 40 && defined(BETA1)
492
vfmaddps rC13, ra1, rB3, rC13
494
movaps 32(pB0,ldb3), rB3
495
#elif KB == 40 && defined(BETA1)
499
vfmaddps rC00, rA0, rB0, rC00
503
vfmaddps rC10, rA1, rB0, rC10
505
movaps 48(pA0,lda), ra1
507
vfmaddps rC01, rA0, rB1, rC01
510
#elif KB == 44 && defined(BETA1)
512
vfmaddps rC11, rA1, rB1, rC11
514
movaps 48(pB0,ldb), rB1
515
#elif KB == 44 && defined(BETA1)
517
vfmaddps rC02, rA0, rB2, rC02
518
#if KB == 44 && defined(BETA0)
520
vfmaddps rC12, rA1, rB2, rC12
522
movaps 48(pB0,ldb,2), rB2
523
#elif KB == 44 && defined(BETA1)
525
vfmaddps rC03, rA0, rB3, rC03
526
#if KB == 44 && defined(BETA1)
528
vfmaddps rC13, rA1, rB3, rC13
530
movaps 48(pB0,ldb3), rB3
531
#elif KB == 44 && defined(BETA0)
535
vfmaddps rC00, ra0, rB0, rC00
539
vfmaddps rC10, ra1, rB0, rC10
541
movaps 64(pA0,lda), rA1
543
vfmaddps rC01, ra0, rB1, rC01
546
#elif KB == 48 && defined(BETA1)
548
vfmaddps rC11, ra1, rB1, rC11
550
movaps 64(pB0,ldb), rB1
551
#elif KB == 48 && defined(BETA1)
553
vfmaddps rC02, ra0, rB2, rC02
554
#if KB == 48 && defined(BETA1)
556
vfmaddps rC12, ra1, rB2, rC12
558
movaps 64(pB0,ldb,2), rB2
559
#elif KB == 48 && defined(BETA1)
561
vfmaddps rC03, ra0, rB3, rC03
562
#if KB == 48 && defined(BETA1)
564
vfmaddps rC13, ra1, rB3, rC13
566
movaps 64(pB0,ldb3), rB3
567
#elif KB == 48 && defined(BETA1)
571
vfmaddps rC00, rA0, rB0, rC00
575
vfmaddps rC10, rA1, rB0, rC10
577
movaps 80(pA0,lda), ra1
579
vfmaddps rC01, rA0, rB1, rC01
582
#elif KB == 52 && defined(BETA1)
584
vfmaddps rC11, rA1, rB1, rC11
586
movaps 80(pB0,ldb), rB1
587
#elif KB == 52 && defined(BETA1)
589
vfmaddps rC02, rA0, rB2, rC02
590
#if KB == 52 && defined(BETA0)
592
vfmaddps rC12, rA1, rB2, rC12
594
movaps 80(pB0,ldb,2), rB2
595
#elif KB == 52 && defined(BETA1)
597
vfmaddps rC03, rA0, rB3, rC03
598
#if KB == 52 && defined(BETA1)
600
vfmaddps rC13, rA1, rB3, rC13
602
movaps 80(pB0,ldb3), rB3
603
#elif KB == 52 && defined(BETA0)
607
vfmaddps rC00, ra0, rB0, rC00
611
vfmaddps rC10, ra1, rB0, rC10
613
movaps 96(pA0,lda), rA1
615
vfmaddps rC01, ra0, rB1, rC01
618
#elif KB == 56 && defined(BETA1)
620
vfmaddps rC11, ra1, rB1, rC11
622
movaps 96(pB0,ldb), rB1
623
#elif KB == 56 && defined(BETA1)
625
vfmaddps rC02, ra0, rB2, rC02
626
#if KB == 56 && defined(BETA1)
628
vfmaddps rC12, ra1, rB2, rC12
630
movaps 96(pB0,ldb,2), rB2
631
#elif KB == 56 && defined(BETA1)
633
vfmaddps rC03, ra0, rB3, rC03
634
#if KB == 56 && defined(BETA1)
636
vfmaddps rC13, ra1, rB3, rC13
638
movaps 96(pB0,ldb3), rB3
639
#elif KB == 56 && defined(BETA1)
643
vfmaddps rC00, rA0, rB0, rC00
647
vfmaddps rC10, rA1, rB0, rC10
649
movaps 112(pA0,lda), ra1
651
vfmaddps rC01, rA0, rB1, rC01
654
#elif KB == 60 && defined(BETA1)
656
vfmaddps rC11, rA1, rB1, rC11
658
movaps 112(pB0,ldb), rB1
659
#elif KB == 60 && defined(BETA1)
661
vfmaddps rC02, rA0, rB2, rC02
662
#if KB == 60 && defined(BETA0)
664
vfmaddps rC12, rA1, rB2, rC12
666
movaps 112(pB0,ldb,2), rB2
667
#elif KB == 60 && defined(BETA1)
669
vfmaddps rC03, rA0, rB3, rC03
670
#if KB == 60 && defined(BETA1)
672
vfmaddps rC13, rA1, rB3, rC13
674
movaps 112(pB0,ldb3), rB3
675
#elif KB == 60 && defined(BETA0)
679
vfmaddps rC00, ra0, rB0, rC00
683
vfmaddps rC10, ra1, rB0, rC10
685
movaps 128(pA0,lda), rA1
687
vfmaddps rC01, ra0, rB1, rC01
690
#elif KB == 64 && defined(BETA1)
692
vfmaddps rC11, ra1, rB1, rC11
694
movaps 128(pB0,ldb), rB1
695
#elif KB == 64 && defined(BETA1)
697
vfmaddps rC02, ra0, rB2, rC02
698
#if KB == 64 && defined(BETA1)
700
vfmaddps rC12, ra1, rB2, rC12
702
movaps 128(pB0,ldb,2), rB2
703
#elif KB == 64 && defined(BETA1)
705
vfmaddps rC03, ra0, rB3, rC03
706
#if KB == 64 && defined(BETA1)
708
vfmaddps rC13, ra1, rB3, rC13
710
movaps 128(pB0,ldb3), rB3
711
#elif KB == 64 && defined(BETA1)
715
vfmaddps rC00, rA0, rB0, rC00
719
vfmaddps rC10, rA1, rB0, rC10
721
movaps 144(pA0,lda), ra1
723
vfmaddps rC01, rA0, rB1, rC01
726
#elif KB == 68 && defined(BETA1)
728
vfmaddps rC11, rA1, rB1, rC11
730
movaps 144(pB0,ldb), rB1
731
#elif KB == 68 && defined(BETA1)
733
vfmaddps rC02, rA0, rB2, rC02
734
#if KB == 68 && defined(BETA0)
736
vfmaddps rC12, rA1, rB2, rC12
738
movaps 144(pB0,ldb,2), rB2
739
#elif KB == 68 && defined(BETA1)
741
vfmaddps rC03, rA0, rB3, rC03
742
#if KB == 68 && defined(BETA1)
744
vfmaddps rC13, rA1, rB3, rC13
746
movaps 144(pB0,ldb3), rB3
747
#elif KB == 68 && defined(BETA0)
751
vfmaddps rC00, ra0, rB0, rC00
755
vfmaddps rC10, ra1, rB0, rC10
757
movaps 160(pA0,lda), rA1
759
vfmaddps rC01, ra0, rB1, rC01
762
#elif KB == 72 && defined(BETA1)
764
vfmaddps rC11, ra1, rB1, rC11
766
movaps 160(pB0,ldb), rB1
767
#elif KB == 72 && defined(BETA1)
769
vfmaddps rC02, ra0, rB2, rC02
770
#if KB == 72 && defined(BETA1)
772
vfmaddps rC12, ra1, rB2, rC12
774
movaps 160(pB0,ldb,2), rB2
775
#elif KB == 72 && defined(BETA1)
777
vfmaddps rC03, ra0, rB3, rC03
778
#if KB == 72 && defined(BETA1)
780
vfmaddps rC13, ra1, rB3, rC13
782
movaps 160(pB0,ldb3), rB3
783
#elif KB == 72 && defined(BETA1)
787
vfmaddps rC00, rA0, rB0, rC00
791
vfmaddps rC10, rA1, rB0, rC10
793
movaps 176(pA0,lda), ra1
795
vfmaddps rC01, rA0, rB1, rC01
798
#elif KB == 76 && defined(BETA1)
800
vfmaddps rC11, rA1, rB1, rC11
802
movaps 176(pB0,ldb), rB1
803
#elif KB == 76 && defined(BETA1)
805
vfmaddps rC02, rA0, rB2, rC02
806
#if KB == 76 && defined(BETA0)
808
vfmaddps rC12, rA1, rB2, rC12
810
movaps 176(pB0,ldb,2), rB2
811
#elif KB == 76 && defined(BETA1)
813
vfmaddps rC03, rA0, rB3, rC03
814
#if KB == 76 && defined(BETA1)
816
vfmaddps rC13, rA1, rB3, rC13
818
movaps 176(pB0,ldb3), rB3
819
#elif KB == 76 && defined(BETA0)
823
vfmaddps rC00, ra0, rB0, rC00
827
vfmaddps rC10, ra1, rB0, rC10
829
movaps 192(pA0,lda), rA1
831
vfmaddps rC01, ra0, rB1, rC01
834
#elif KB == 80 && defined(BETA1)
836
vfmaddps rC11, ra1, rB1, rC11
838
movaps 192(pB0,ldb), rB1
839
#elif KB == 80 && defined(BETA1)
841
vfmaddps rC02, ra0, rB2, rC02
842
#if KB == 80 && defined(BETA1)
844
vfmaddps rC12, ra1, rB2, rC12
846
movaps 192(pB0,ldb,2), rB2
847
#elif KB == 80 && defined(BETA1)
849
vfmaddps rC03, ra0, rB3, rC03
850
#if KB == 80 && defined(BETA1)
852
vfmaddps rC13, ra1, rB3, rC13
854
movaps 192(pB0,ldb3), rB3
855
#elif KB == 80 && defined(BETA1)
859
vfmaddps rC00, rA0, rB0, rC00
863
vfmaddps rC10, rA1, rB0, rC10
865
movaps 208(pA0,lda), ra1
867
vfmaddps rC01, rA0, rB1, rC01
870
#elif KB == 84 && defined(BETA1)
872
vfmaddps rC11, rA1, rB1, rC11
874
movaps 208(pB0,ldb), rB1
875
#elif KB == 84 && defined(BETA1)
877
vfmaddps rC02, rA0, rB2, rC02
878
#if KB == 84 && defined(BETA0)
880
vfmaddps rC12, rA1, rB2, rC12
882
movaps 208(pB0,ldb,2), rB2
883
#elif KB == 84 && defined(BETA1)
885
vfmaddps rC03, rA0, rB3, rC03
886
#if KB == 84 && defined(BETA1)
888
vfmaddps rC13, rA1, rB3, rC13
890
movaps 208(pB0,ldb3), rB3
891
#elif KB == 84 && defined(BETA0)
895
vfmaddps rC00, ra0, rB0, rC00
899
vfmaddps rC10, ra1, rB0, rC10
901
movaps 224(pA0,lda), rA1
903
vfmaddps rC01, ra0, rB1, rC01
906
#elif KB == 88 && defined(BETA1)
908
vfmaddps rC11, ra1, rB1, rC11
910
movaps 224(pB0,ldb), rB1
911
#elif KB == 88 && defined(BETA1)
913
vfmaddps rC02, ra0, rB2, rC02
914
#if KB == 88 && defined(BETA1)
916
vfmaddps rC12, ra1, rB2, rC12
918
movaps 224(pB0,ldb,2), rB2
919
#elif KB == 88 && defined(BETA1)
921
vfmaddps rC03, ra0, rB3, rC03
922
#if KB == 88 && defined(BETA1)
924
vfmaddps rC13, ra1, rB3, rC13
926
movaps 224(pB0,ldb3), rB3
927
#elif KB == 88 && defined(BETA1)
931
vfmaddps rC00, rA0, rB0, rC00
935
vfmaddps rC10, rA1, rB0, rC10
937
movaps 240(pA0,lda), ra1
939
vfmaddps rC01, rA0, rB1, rC01
942
#elif KB == 92 && defined(BETA1)
944
vfmaddps rC11, rA1, rB1, rC11
946
movaps 240(pB0,ldb), rB1
947
#elif KB == 92 && defined(BETA1)
949
vfmaddps rC02, rA0, rB2, rC02
950
#if KB == 92 && defined(BETA0)
952
vfmaddps rC12, rA1, rB2, rC12
954
movaps 240(pB0,ldb,2), rB2
955
#elif KB == 92 && defined(BETA1)
957
vfmaddps rC03, rA0, rB3, rC03
958
#if KB == 92 && defined(BETA1)
960
vfmaddps rC13, rA1, rB3, rC13
962
movaps 240(pB0,ldb3), rB3
963
#elif KB == 92 && defined(BETA0)
967
vfmaddps rC00, ra0, rB0, rC00
971
vfmaddps rC10, ra1, rB0, rC10
973
movaps 256(pA0,lda), rA1
975
vfmaddps rC01, ra0, rB1, rC01
978
#elif KB == 96 && defined(BETA1)
980
vfmaddps rC11, ra1, rB1, rC11
982
movaps 256(pB0,ldb), rB1
983
#elif KB == 96 && defined(BETA1)
985
vfmaddps rC02, ra0, rB2, rC02
986
#if KB == 96 && defined(BETA1)
988
vfmaddps rC12, ra1, rB2, rC12
990
movaps 256(pB0,ldb,2), rB2
991
#elif KB == 96 && defined(BETA1)
993
vfmaddps rC03, ra0, rB3, rC03
994
#if KB == 96 && defined(BETA1)
996
vfmaddps rC13, ra1, rB3, rC13
998
movaps 256(pB0,ldb3), rB3
999
#elif KB == 96 && defined(BETA1)
1003
vfmaddps rC00, rA0, rB0, rC00
1005
movaps 272(pA0), ra0
1007
vfmaddps rC10, rA1, rB0, rC10
1009
movaps 272(pA0,lda), ra1
1011
vfmaddps rC01, rA0, rB1, rC01
1013
movaps 272(pB0), rB0
1014
#elif KB == 100 && defined(BETA1)
1016
vfmaddps rC11, rA1, rB1, rC11
1018
movaps 272(pB0,ldb), rB1
1019
#elif KB == 100 && defined(BETA1)
1021
vfmaddps rC02, rA0, rB2, rC02
1022
#if KB == 100 && defined(BETA0)
1024
vfmaddps rC12, rA1, rB2, rC12
1026
movaps 272(pB0,ldb,2), rB2
1027
#elif KB == 100 && defined(BETA1)
1029
vfmaddps rC03, rA0, rB3, rC03
1030
#if KB == 100 && defined(BETA1)
1032
vfmaddps rC13, rA1, rB3, rC13
1034
movaps 272(pB0,ldb3), rB3
1035
#elif KB == 100 && defined(BETA0)
1039
vfmaddps rC00, ra0, rB0, rC00
1041
movaps 288(pA0), rA0
1043
vfmaddps rC10, ra1, rB0, rC10
1045
movaps 288(pA0,lda), rA1
1047
vfmaddps rC01, ra0, rB1, rC01
1049
movaps 288(pB0), rB0
1050
#elif KB == 104 && defined(BETA1)
1052
vfmaddps rC11, ra1, rB1, rC11
1054
movaps 288(pB0,ldb), rB1
1055
#elif KB == 104 && defined(BETA1)
1057
vfmaddps rC02, ra0, rB2, rC02
1058
#if KB == 104 && defined(BETA1)
1060
vfmaddps rC12, ra1, rB2, rC12
1062
movaps 288(pB0,ldb,2), rB2
1063
#elif KB == 104 && defined(BETA1)
1065
vfmaddps rC03, ra0, rB3, rC03
1066
#if KB == 104 && defined(BETA1)
1068
vfmaddps rC13, ra1, rB3, rC13
1070
movaps 288(pB0,ldb3), rB3
1071
#elif KB == 104 && defined(BETA1)
1075
vfmaddps rC00, rA0, rB0, rC00
1077
movaps 304(pA0), ra0
1079
vfmaddps rC10, rA1, rB0, rC10
1081
movaps 304(pA0,lda), ra1
1083
vfmaddps rC01, rA0, rB1, rC01
1085
movaps 304(pB0), rB0
1086
#elif KB == 108 && defined(BETA1)
1088
vfmaddps rC11, rA1, rB1, rC11
1090
movaps 304(pB0,ldb), rB1
1091
#elif KB == 108 && defined(BETA1)
1093
vfmaddps rC02, rA0, rB2, rC02
1094
#if KB == 108 && defined(BETA0)
1096
vfmaddps rC12, rA1, rB2, rC12
1098
movaps 304(pB0,ldb,2), rB2
1099
#elif KB == 108 && defined(BETA1)
1101
vfmaddps rC03, rA0, rB3, rC03
1102
#if KB == 108 && defined(BETA1)
1104
vfmaddps rC13, rA1, rB3, rC13
1106
movaps 304(pB0,ldb3), rB3
1107
#elif KB == 108 && defined(BETA0)
1111
vfmaddps rC00, ra0, rB0, rC00
1113
movaps 320(pA0), rA0
1115
vfmaddps rC10, ra1, rB0, rC10
1117
movaps 320(pA0,lda), rA1
1119
vfmaddps rC01, ra0, rB1, rC01
1121
movaps 320(pB0), rB0
1122
#elif KB == 112 && defined(BETA1)
1124
vfmaddps rC11, ra1, rB1, rC11
1126
movaps 320(pB0,ldb), rB1
1127
#elif KB == 112 && defined(BETA1)
1129
vfmaddps rC02, ra0, rB2, rC02
1130
#if KB == 112 && defined(BETA1)
1132
vfmaddps rC12, ra1, rB2, rC12
1134
movaps 320(pB0,ldb,2), rB2
1135
#elif KB == 112 && defined(BETA1)
1137
vfmaddps rC03, ra0, rB3, rC03
1138
#if KB == 112 && defined(BETA1)
1140
vfmaddps rC13, ra1, rB3, rC13
1142
movaps 320(pB0,ldb3), rB3
1143
#elif KB == 112 && defined(BETA1)
1147
vfmaddps rC00, rA0, rB0, rC00
1149
movaps 336(pA0), ra0
1151
vfmaddps rC10, rA1, rB0, rC10
1153
movaps 336(pA0,lda), ra1
1155
vfmaddps rC01, rA0, rB1, rC01
1157
movaps 336(pB0), rB0
1158
#elif KB == 116 && defined(BETA1)
1160
vfmaddps rC11, rA1, rB1, rC11
1162
movaps 336(pB0,ldb), rB1
1163
#elif KB == 116 && defined(BETA1)
1165
vfmaddps rC02, rA0, rB2, rC02
1166
#if KB == 116 && defined(BETA0)
1168
vfmaddps rC12, rA1, rB2, rC12
1170
movaps 336(pB0,ldb,2), rB2
1171
#elif KB == 116 && defined(BETA1)
1173
vfmaddps rC03, rA0, rB3, rC03
1174
#if KB == 116 && defined(BETA1)
1176
vfmaddps rC13, rA1, rB3, rC13
1178
movaps 336(pB0,ldb3), rB3
1179
#elif KB == 116 && defined(BETA0)
1183
vfmaddps rC00, ra0, rB0, rC00
1185
movaps 352(pA0), rA0
1187
vfmaddps rC10, ra1, rB0, rC10
1189
movaps 352(pA0,lda), rA1
1191
vfmaddps rC01, ra0, rB1, rC01
1193
movaps 352(pB0), rB0
1194
#elif KB == 120 && defined(BETA1)
1196
vfmaddps rC11, ra1, rB1, rC11
1198
movaps 352(pB0,ldb), rB1
1199
#elif KB == 120 && defined(BETA1)
1201
vfmaddps rC02, ra0, rB2, rC02
1202
#if KB == 120 && defined(BETA1)
1204
vfmaddps rC12, ra1, rB2, rC12
1206
movaps 352(pB0,ldb,2), rB2
1207
#elif KB == 120 && defined(BETA1)
1209
vfmaddps rC03, ra0, rB3, rC03
1210
#if KB == 120 && defined(BETA1)
1212
vfmaddps rC13, ra1, rB3, rC13
1214
movaps 352(pB0,ldb3), rB3
1215
#elif KB == 120 && defined(BETA1)
1219
vfmaddps rC00, rA0, rB0, rC00
1221
movaps 368(pA0), ra0
1223
vfmaddps rC10, rA1, rB0, rC10
1225
movaps 368(pA0,lda), ra1
1227
vfmaddps rC01, rA0, rB1, rC01
1229
movaps 368(pB0), rB0
1230
#elif KB == 124 && defined(BETA1)
1232
vfmaddps rC11, rA1, rB1, rC11
1234
movaps 368(pB0,ldb), rB1
1235
#elif KB == 124 && defined(BETA1)
1237
vfmaddps rC02, rA0, rB2, rC02
1238
#if KB == 124 && defined(BETA0)
1240
vfmaddps rC12, rA1, rB2, rC12
1242
movaps 368(pB0,ldb,2), rB2
1243
#elif KB == 124 && defined(BETA1)
1245
vfmaddps rC03, rA0, rB3, rC03
1246
#if KB == 124 && defined(BETA1)
1248
vfmaddps rC13, rA1, rB3, rC13
1250
movaps 368(pB0,ldb3), rB3
1251
#elif KB == 124 && defined(BETA0)
1255
vfmaddps rC00, ra0, rB0, rC00
1257
movaps 384(pA0), rA0
1259
vfmaddps rC10, ra1, rB0, rC10
1261
movaps 384(pA0,lda), rA1
1263
vfmaddps rC01, ra0, rB1, rC01
1265
movaps 384(pB0), rB0
1266
#elif KB == 128 && defined(BETA1)
1268
vfmaddps rC11, ra1, rB1, rC11
1270
movaps 384(pB0,ldb), rB1
1271
#elif KB == 128 && defined(BETA1)
1273
vfmaddps rC02, ra0, rB2, rC02
1274
#if KB == 128 && defined(BETA1)
1276
vfmaddps rC12, ra1, rB2, rC12
1278
movaps 384(pB0,ldb,2), rB2
1279
#elif KB == 128 && defined(BETA1)
1281
vfmaddps rC03, ra0, rB3, rC03
1282
#if KB == 128 && defined(BETA1)
1284
vfmaddps rC13, ra1, rB3, rC13
1286
movaps 384(pB0,ldb3), rB3
1287
#elif KB == 128 && defined(BETA1)
1291
vfmaddps rC00, rA0, rB0, rC00
1293
movaps 400(pA0), ra0
1295
vfmaddps rC10, rA1, rB0, rC10
1297
movaps 400(pA0,lda), ra1
1299
vfmaddps rC01, rA0, rB1, rC01
1301
movaps 400(pB0), rB0
1302
#elif KB == 132 && defined(BETA1)
1304
vfmaddps rC11, rA1, rB1, rC11
1306
movaps 400(pB0,ldb), rB1
1307
#elif KB == 132 && defined(BETA1)
1309
vfmaddps rC02, rA0, rB2, rC02
1310
#if KB == 132 && defined(BETA0)
1312
vfmaddps rC12, rA1, rB2, rC12
1314
movaps 400(pB0,ldb,2), rB2
1315
#elif KB == 132 && defined(BETA1)
1317
vfmaddps rC03, rA0, rB3, rC03
1318
#if KB == 132 && defined(BETA1)
1320
vfmaddps rC13, rA1, rB3, rC13
1322
movaps 400(pB0,ldb3), rB3
1323
#elif KB == 132 && defined(BETA0)
1327
vfmaddps rC00, ra0, rB0, rC00
1329
movaps 416(pA0), rA0
1331
vfmaddps rC10, ra1, rB0, rC10
1333
movaps 416(pA0,lda), rA1
1335
vfmaddps rC01, ra0, rB1, rC01
1337
movaps 416(pB0), rB0
1338
#elif KB == 136 && defined(BETA1)
1340
vfmaddps rC11, ra1, rB1, rC11
1342
movaps 416(pB0,ldb), rB1
1343
#elif KB == 136 && defined(BETA1)
1345
vfmaddps rC02, ra0, rB2, rC02
1346
#if KB == 136 && defined(BETA1)
1348
vfmaddps rC12, ra1, rB2, rC12
1350
movaps 416(pB0,ldb,2), rB2
1351
#elif KB == 136 && defined(BETA1)
1353
vfmaddps rC03, ra0, rB3, rC03
1354
#if KB == 136 && defined(BETA1)
1356
vfmaddps rC13, ra1, rB3, rC13
1358
movaps 416(pB0,ldb3), rB3
1359
#elif KB == 136 && defined(BETA1)
1363
vfmaddps rC00, rA0, rB0, rC00
1365
movaps 432(pA0), ra0
1367
vfmaddps rC10, rA1, rB0, rC10
1369
movaps 432(pA0,lda), ra1
1371
vfmaddps rC01, rA0, rB1, rC01
1373
movaps 432(pB0), rB0
1374
#elif KB == 140 && defined(BETA1)
1376
vfmaddps rC11, rA1, rB1, rC11
1378
movaps 432(pB0,ldb), rB1
1379
#elif KB == 140 && defined(BETA1)
1381
vfmaddps rC02, rA0, rB2, rC02
1382
#if KB == 140 && defined(BETA0)
1384
vfmaddps rC12, rA1, rB2, rC12
1386
movaps 432(pB0,ldb,2), rB2
1387
#elif KB == 140 && defined(BETA1)
1389
vfmaddps rC03, rA0, rB3, rC03
1390
#if KB == 140 && defined(BETA1)
1392
vfmaddps rC13, rA1, rB3, rC13
1394
movaps 432(pB0,ldb3), rB3
1395
#elif KB == 140 && defined(BETA0)
1399
vfmaddps rC00, ra0, rB0, rC00
1401
movaps 448(pA0), rA0
1403
vfmaddps rC10, ra1, rB0, rC10
1405
movaps 448(pA0,lda), rA1
1407
vfmaddps rC01, ra0, rB1, rC01
1409
movaps 448(pB0), rB0
1410
#elif KB == 144 && defined(BETA1)
1412
vfmaddps rC11, ra1, rB1, rC11
1414
movaps 448(pB0,ldb), rB1
1415
#elif KB == 144 && defined(BETA1)
1417
vfmaddps rC02, ra0, rB2, rC02
1418
#if KB == 144 && defined(BETA1)
1420
vfmaddps rC12, ra1, rB2, rC12
1422
movaps 448(pB0,ldb,2), rB2
1423
#elif KB == 144 && defined(BETA1)
1425
vfmaddps rC03, ra0, rB3, rC03
1426
#if KB == 144 && defined(BETA1)
1428
vfmaddps rC13, ra1, rB3, rC13
1430
movaps 448(pB0,ldb3), rB3
1431
#elif KB == 144 && defined(BETA1)
1435
vfmaddps rC00, rA0, rB0, rC00
1437
movaps 464(pA0), ra0
1439
vfmaddps rC10, rA1, rB0, rC10
1441
movaps 464(pA0,lda), ra1
1443
vfmaddps rC01, rA0, rB1, rC01
1445
movaps 464(pB0), rB0
1446
#elif KB == 148 && defined(BETA1)
1448
vfmaddps rC11, rA1, rB1, rC11
1450
movaps 464(pB0,ldb), rB1
1451
#elif KB == 148 && defined(BETA1)
1453
vfmaddps rC02, rA0, rB2, rC02
1454
#if KB == 148 && defined(BETA0)
1456
vfmaddps rC12, rA1, rB2, rC12
1458
movaps 464(pB0,ldb,2), rB2
1459
#elif KB == 148 && defined(BETA1)
1461
vfmaddps rC03, rA0, rB3, rC03
1462
#if KB == 148 && defined(BETA1)
1464
vfmaddps rC13, rA1, rB3, rC13
1466
movaps 464(pB0,ldb3), rB3
1467
#elif KB == 148 && defined(BETA0)
1471
vfmaddps rC00, ra0, rB0, rC00
1473
movaps 480(pA0), rA0
1475
vfmaddps rC10, ra1, rB0, rC10
1477
movaps 480(pA0,lda), rA1
1479
vfmaddps rC01, ra0, rB1, rC01
1481
movaps 480(pB0), rB0
1482
#elif KB == 152 && defined(BETA1)
1484
vfmaddps rC11, ra1, rB1, rC11
1486
movaps 480(pB0,ldb), rB1
1487
#elif KB == 152 && defined(BETA1)
1489
vfmaddps rC02, ra0, rB2, rC02
1490
#if KB == 152 && defined(BETA1)
1492
vfmaddps rC12, ra1, rB2, rC12
1494
movaps 480(pB0,ldb,2), rB2
1495
#elif KB == 152 && defined(BETA1)
1497
vfmaddps rC03, ra0, rB3, rC03
1498
#if KB == 152 && defined(BETA1)
1500
vfmaddps rC13, ra1, rB3, rC13
1502
movaps 480(pB0,ldb3), rB3
1503
#elif KB == 152 && defined(BETA1)
1507
vfmaddps rC00, rA0, rB0, rC00
1509
movaps 496(pA0), ra0
1511
vfmaddps rC10, rA1, rB0, rC10
1513
movaps 496(pA0,lda), ra1
1515
vfmaddps rC01, rA0, rB1, rC01
1517
movaps 496(pB0), rB0
1518
#elif KB == 156 && defined(BETA1)
1520
vfmaddps rC11, rA1, rB1, rC11
1522
movaps 496(pB0,ldb), rB1
1523
#elif KB == 156 && defined(BETA1)
1525
vfmaddps rC02, rA0, rB2, rC02
1526
#if KB == 156 && defined(BETA0)
1528
vfmaddps rC12, rA1, rB2, rC12
1530
movaps 496(pB0,ldb,2), rB2
1531
#elif KB == 156 && defined(BETA1)
1533
vfmaddps rC03, rA0, rB3, rC03
1534
#if KB == 156 && defined(BETA1)
1536
vfmaddps rC13, rA1, rB3, rC13
1538
movaps 496(pB0,ldb3), rB3
1539
#elif KB == 156 && defined(BETA0)
1543
vfmaddps rC00, ra0, rB0, rC00
1545
movaps 512(pA0), rA0
1547
vfmaddps rC10, ra1, rB0, rC10
1549
movaps 512(pA0,lda), rA1
1551
vfmaddps rC01, ra0, rB1, rC01
1553
movaps 512(pB0), rB0
1554
#elif KB == 160 && defined(BETA1)
1556
vfmaddps rC11, ra1, rB1, rC11
1558
movaps 512(pB0,ldb), rB1
1559
#elif KB == 160 && defined(BETA1)
1561
vfmaddps rC02, ra0, rB2, rC02
1562
#if KB == 160 && defined(BETA1)
1564
vfmaddps rC12, ra1, rB2, rC12
1566
movaps 512(pB0,ldb,2), rB2
1567
#elif KB == 160 && defined(BETA1)
1569
vfmaddps rC03, ra0, rB3, rC03
1570
#if KB == 160 && defined(BETA1)
1572
vfmaddps rC13, ra1, rB3, rC13
1574
movaps 512(pB0,ldb3), rB3
1575
#elif KB == 160 && defined(BETA1)
1579
vfmaddps rC00, rA0, rB0, rC00
1581
movaps 528(pA0), ra0
1583
vfmaddps rC10, rA1, rB0, rC10
1585
movaps 528(pA0,lda), ra1
1587
vfmaddps rC01, rA0, rB1, rC01
1589
movaps 528(pB0), rB0
1590
#elif KB == 164 && defined(BETA1)
1592
vfmaddps rC11, rA1, rB1, rC11
1594
movaps 528(pB0,ldb), rB1
1595
#elif KB == 164 && defined(BETA1)
1597
vfmaddps rC02, rA0, rB2, rC02
1598
#if KB == 164 && defined(BETA0)
1600
vfmaddps rC12, rA1, rB2, rC12
1602
movaps 528(pB0,ldb,2), rB2
1603
#elif KB == 164 && defined(BETA1)
1605
vfmaddps rC03, rA0, rB3, rC03
1606
#if KB == 164 && defined(BETA1)
1608
vfmaddps rC13, rA1, rB3, rC13
1610
movaps 528(pB0,ldb3), rB3
1611
#elif KB == 164 && defined(BETA0)
1615
vfmaddps rC00, ra0, rB0, rC00
1617
movaps 544(pA0), rA0
1619
vfmaddps rC10, ra1, rB0, rC10
1621
movaps 544(pA0,lda), rA1
1623
vfmaddps rC01, ra0, rB1, rC01
1625
movaps 544(pB0), rB0
1626
#elif KB == 168 && defined(BETA1)
1628
vfmaddps rC11, ra1, rB1, rC11
1630
movaps 544(pB0,ldb), rB1
1631
#elif KB == 168 && defined(BETA1)
1633
vfmaddps rC02, ra0, rB2, rC02
1634
#if KB == 168 && defined(BETA1)
1636
vfmaddps rC12, ra1, rB2, rC12
1638
movaps 544(pB0,ldb,2), rB2
1639
#elif KB == 168 && defined(BETA1)
1641
vfmaddps rC03, ra0, rB3, rC03
1642
#if KB == 168 && defined(BETA1)
1644
vfmaddps rC13, ra1, rB3, rC13
1646
movaps 544(pB0,ldb3), rB3
1647
#elif KB == 168 && defined(BETA1)
1651
vfmaddps rC00, rA0, rB0, rC00
1653
movaps 560(pA0), ra0
1655
vfmaddps rC10, rA1, rB0, rC10
1657
movaps 560(pA0,lda), ra1
1659
vfmaddps rC01, rA0, rB1, rC01
1661
movaps 560(pB0), rB0
1662
#elif KB == 172 && defined(BETA1)
1664
vfmaddps rC11, rA1, rB1, rC11
1666
movaps 560(pB0,ldb), rB1
1667
#elif KB == 172 && defined(BETA1)
1669
vfmaddps rC02, rA0, rB2, rC02
1670
#if KB == 172 && defined(BETA0)
1672
vfmaddps rC12, rA1, rB2, rC12
1674
movaps 560(pB0,ldb,2), rB2
1675
#elif KB == 172 && defined(BETA1)
1677
vfmaddps rC03, rA0, rB3, rC03
1678
#if KB == 172 && defined(BETA1)
1680
vfmaddps rC13, rA1, rB3, rC13
1682
movaps 560(pB0,ldb3), rB3
1683
#elif KB == 172 && defined(BETA0)
1687
vfmaddps rC00, ra0, rB0, rC00
1689
movaps 576(pA0), rA0
1691
vfmaddps rC10, ra1, rB0, rC10
1693
movaps 576(pA0,lda), rA1
1695
vfmaddps rC01, ra0, rB1, rC01
1697
movaps 576(pB0), rB0
1698
#elif KB == 176 && defined(BETA1)
1700
vfmaddps rC11, ra1, rB1, rC11
1702
movaps 576(pB0,ldb), rB1
1703
#elif KB == 176 && defined(BETA1)
1705
vfmaddps rC02, ra0, rB2, rC02
1706
#if KB == 176 && defined(BETA1)
1708
vfmaddps rC12, ra1, rB2, rC12
1710
movaps 576(pB0,ldb,2), rB2
1711
#elif KB == 176 && defined(BETA1)
1713
vfmaddps rC03, ra0, rB3, rC03
1714
#if KB == 176 && defined(BETA1)
1716
vfmaddps rC13, ra1, rB3, rC13
1718
movaps 576(pB0,ldb3), rB3
1719
#elif KB == 176 && defined(BETA1)
1723
vfmaddps rC00, rA0, rB0, rC00
1725
movaps 592(pA0), ra0
1727
vfmaddps rC10, rA1, rB0, rC10
1729
movaps 592(pA0,lda), ra1
1731
vfmaddps rC01, rA0, rB1, rC01
1733
movaps 592(pB0), rB0
1734
#elif KB == 180 && defined(BETA1)
1736
vfmaddps rC11, rA1, rB1, rC11
1738
movaps 592(pB0,ldb), rB1
1739
#elif KB == 180 && defined(BETA1)
1741
vfmaddps rC02, rA0, rB2, rC02
1742
#if KB == 180 && defined(BETA0)
1744
vfmaddps rC12, rA1, rB2, rC12
1746
movaps 592(pB0,ldb,2), rB2
1747
#elif KB == 180 && defined(BETA1)
1749
vfmaddps rC03, rA0, rB3, rC03
1750
#if KB == 180 && defined(BETA1)
1752
vfmaddps rC13, rA1, rB3, rC13
1754
movaps 592(pB0,ldb3), rB3
1755
#elif KB == 180 && defined(BETA0)
1759
vfmaddps rC00, ra0, rB0, rC00
1761
movaps 608(pA0), rA0
1763
vfmaddps rC10, ra1, rB0, rC10
1765
movaps 608(pA0,lda), rA1
1767
vfmaddps rC01, ra0, rB1, rC01
1769
movaps 608(pB0), rB0
1770
#elif KB == 184 && defined(BETA1)
1772
vfmaddps rC11, ra1, rB1, rC11
1774
movaps 608(pB0,ldb), rB1
1775
#elif KB == 184 && defined(BETA1)
1777
vfmaddps rC02, ra0, rB2, rC02
1778
#if KB == 184 && defined(BETA1)
1780
vfmaddps rC12, ra1, rB2, rC12
1782
movaps 608(pB0,ldb,2), rB2
1783
#elif KB == 184 && defined(BETA1)
1785
vfmaddps rC03, ra0, rB3, rC03
1786
#if KB == 184 && defined(BETA1)
1788
vfmaddps rC13, ra1, rB3, rC13
1790
movaps 608(pB0,ldb3), rB3
1791
#elif KB == 184 && defined(BETA1)
1795
vfmaddps rC00, rA0, rB0, rC00
1797
movaps 624(pA0), ra0
1799
vfmaddps rC10, rA1, rB0, rC10
1801
movaps 624(pA0,lda), ra1
1803
vfmaddps rC01, rA0, rB1, rC01
1805
movaps 624(pB0), rB0
1806
#elif KB == 188 && defined(BETA1)
1808
vfmaddps rC11, rA1, rB1, rC11
1810
movaps 624(pB0,ldb), rB1
1811
#elif KB == 188 && defined(BETA1)
1813
vfmaddps rC02, rA0, rB2, rC02
1814
#if KB == 188 && defined(BETA0)
1816
vfmaddps rC12, rA1, rB2, rC12
1818
movaps 624(pB0,ldb,2), rB2
1819
#elif KB == 188 && defined(BETA1)
1821
vfmaddps rC03, rA0, rB3, rC03
1822
#if KB == 188 && defined(BETA1)
1824
vfmaddps rC13, rA1, rB3, rC13
1826
movaps 624(pB0,ldb3), rB3
1827
#elif KB == 188 && defined(BETA0)
1831
vfmaddps rC00, ra0, rB0, rC00
1833
movaps 640(pA0), rA0
1835
vfmaddps rC10, ra1, rB0, rC10
1837
movaps 640(pA0,lda), rA1
1839
vfmaddps rC01, ra0, rB1, rC01
1841
movaps 640(pB0), rB0
1842
#elif KB == 192 && defined(BETA1)
1844
vfmaddps rC11, ra1, rB1, rC11
1846
movaps 640(pB0,ldb), rB1
1847
#elif KB == 192 && defined(BETA1)
1849
vfmaddps rC02, ra0, rB2, rC02
1850
#if KB == 192 && defined(BETA1)
1852
vfmaddps rC12, ra1, rB2, rC12
1854
movaps 640(pB0,ldb,2), rB2
1855
#elif KB == 192 && defined(BETA1)
1857
vfmaddps rC03, ra0, rB3, rC03
1858
#if KB == 192 && defined(BETA1)
1860
vfmaddps rC13, ra1, rB3, rC13
1862
movaps 640(pB0,ldb3), rB3
1863
#elif KB == 192 && defined(BETA1)
1867
vfmaddps rC00, rA0, rB0, rC00
1869
movaps 656(pA0), ra0
1871
vfmaddps rC10, rA1, rB0, rC10
1873
movaps 656(pA0,lda), ra1
1875
vfmaddps rC01, rA0, rB1, rC01
1877
movaps 656(pB0), rB0
1878
#elif KB == 196 && defined(BETA1)
1880
vfmaddps rC11, rA1, rB1, rC11
1882
movaps 656(pB0,ldb), rB1
1883
#elif KB == 196 && defined(BETA1)
1885
vfmaddps rC02, rA0, rB2, rC02
1886
#if KB == 196 && defined(BETA0)
1888
vfmaddps rC12, rA1, rB2, rC12
1890
movaps 656(pB0,ldb,2), rB2
1891
#elif KB == 196 && defined(BETA1)
1893
vfmaddps rC03, rA0, rB3, rC03
1894
#if KB == 196 && defined(BETA1)
1896
vfmaddps rC13, rA1, rB3, rC13
1898
movaps 656(pB0,ldb3), rB3
1899
#elif KB == 196 && defined(BETA0)
1903
vfmaddps rC00, ra0, rB0, rC00
1905
movaps 672(pA0), rA0
1907
vfmaddps rC10, ra1, rB0, rC10
1909
movaps 672(pA0,lda), rA1
1911
vfmaddps rC01, ra0, rB1, rC01
1913
movaps 672(pB0), rB0
1914
#elif KB == 200 && defined(BETA1)
1916
vfmaddps rC11, ra1, rB1, rC11
1918
movaps 672(pB0,ldb), rB1
1919
#elif KB == 200 && defined(BETA1)
1921
vfmaddps rC02, ra0, rB2, rC02
1922
#if KB == 200 && defined(BETA1)
1924
vfmaddps rC12, ra1, rB2, rC12
1926
movaps 672(pB0,ldb,2), rB2
1927
#elif KB == 200 && defined(BETA1)
1929
vfmaddps rC03, ra0, rB3, rC03
1930
#if KB == 200 && defined(BETA1)
1932
vfmaddps rC13, ra1, rB3, rC13
1934
movaps 672(pB0,ldb3), rB3
1935
#elif KB == 200 && defined(BETA1)
1939
vfmaddps rC00, rA0, rB0, rC00
1941
movaps 688(pA0), ra0
1943
vfmaddps rC10, rA1, rB0, rC10
1945
movaps 688(pA0,lda), ra1
1947
vfmaddps rC01, rA0, rB1, rC01
1949
movaps 688(pB0), rB0
1950
#elif KB == 204 && defined(BETA1)
1952
vfmaddps rC11, rA1, rB1, rC11
1954
movaps 688(pB0,ldb), rB1
1955
#elif KB == 204 && defined(BETA1)
1957
vfmaddps rC02, rA0, rB2, rC02
1958
#if KB == 204 && defined(BETA0)
1960
vfmaddps rC12, rA1, rB2, rC12
1962
movaps 688(pB0,ldb,2), rB2
1963
#elif KB == 204 && defined(BETA1)
1965
vfmaddps rC03, rA0, rB3, rC03
1966
#if KB == 204 && defined(BETA1)
1968
vfmaddps rC13, rA1, rB3, rC13
1970
movaps 688(pB0,ldb3), rB3
1971
#elif KB == 204 && defined(BETA0)
1975
vfmaddps rC00, ra0, rB0, rC00
1977
movaps 704(pA0), rA0
1979
vfmaddps rC10, ra1, rB0, rC10
1981
movaps 704(pA0,lda), rA1
1983
vfmaddps rC01, ra0, rB1, rC01
1985
movaps 704(pB0), rB0
1986
#elif KB == 208 && defined(BETA1)
1988
vfmaddps rC11, ra1, rB1, rC11
1990
movaps 704(pB0,ldb), rB1
1991
#elif KB == 208 && defined(BETA1)
1993
vfmaddps rC02, ra0, rB2, rC02
1994
#if KB == 208 && defined(BETA1)
1996
vfmaddps rC12, ra1, rB2, rC12
1998
movaps 704(pB0,ldb,2), rB2
1999
#elif KB == 208 && defined(BETA1)
2001
vfmaddps rC03, ra0, rB3, rC03
2002
#if KB == 208 && defined(BETA1)
2004
vfmaddps rC13, ra1, rB3, rC13
2006
movaps 704(pB0,ldb3), rB3
2007
#elif KB == 208 && defined(BETA1)
2011
vfmaddps rC00, rA0, rB0, rC00
2013
movaps 720(pA0), ra0
2015
vfmaddps rC10, rA1, rB0, rC10
2017
movaps 720(pA0,lda), ra1
2019
vfmaddps rC01, rA0, rB1, rC01
2021
movaps 720(pB0), rB0
2022
#elif KB == 212 && defined(BETA1)
2024
vfmaddps rC11, rA1, rB1, rC11
2026
movaps 720(pB0,ldb), rB1
2027
#elif KB == 212 && defined(BETA1)
2029
vfmaddps rC02, rA0, rB2, rC02
2030
#if KB == 212 && defined(BETA0)
2032
vfmaddps rC12, rA1, rB2, rC12
2034
movaps 720(pB0,ldb,2), rB2
2035
#elif KB == 212 && defined(BETA1)
2037
vfmaddps rC03, rA0, rB3, rC03
2038
#if KB == 212 && defined(BETA1)
2040
vfmaddps rC13, rA1, rB3, rC13
2042
movaps 720(pB0,ldb3), rB3
2043
#elif KB == 212 && defined(BETA0)
2047
vfmaddps rC00, ra0, rB0, rC00
2049
movaps 736(pA0), rA0
2051
vfmaddps rC10, ra1, rB0, rC10
2053
movaps 736(pA0,lda), rA1
2055
vfmaddps rC01, ra0, rB1, rC01
2057
movaps 736(pB0), rB0
2058
#elif KB == 216 && defined(BETA1)
2060
vfmaddps rC11, ra1, rB1, rC11
2062
movaps 736(pB0,ldb), rB1
2063
#elif KB == 216 && defined(BETA1)
2065
vfmaddps rC02, ra0, rB2, rC02
2066
#if KB == 216 && defined(BETA1)
2068
vfmaddps rC12, ra1, rB2, rC12
2070
movaps 736(pB0,ldb,2), rB2
2071
#elif KB == 216 && defined(BETA1)
2073
vfmaddps rC03, ra0, rB3, rC03
2074
#if KB == 216 && defined(BETA1)
2076
vfmaddps rC13, ra1, rB3, rC13
2078
movaps 736(pB0,ldb3), rB3
2079
#elif KB == 216 && defined(BETA1)
2083
vfmaddps rC00, rA0, rB0, rC00
2085
movaps 752(pA0), ra0
2087
vfmaddps rC10, rA1, rB0, rC10
2089
movaps 752(pA0,lda), ra1
2091
vfmaddps rC01, rA0, rB1, rC01
2093
movaps 752(pB0), rB0
2094
#elif KB == 220 && defined(BETA1)
2096
vfmaddps rC11, rA1, rB1, rC11
2098
movaps 752(pB0,ldb), rB1
2099
#elif KB == 220 && defined(BETA1)
2101
vfmaddps rC02, rA0, rB2, rC02
2102
#if KB == 220 && defined(BETA0)
2104
vfmaddps rC12, rA1, rB2, rC12
2106
movaps 752(pB0,ldb,2), rB2
2107
#elif KB == 220 && defined(BETA1)
2109
vfmaddps rC03, rA0, rB3, rC03
2110
#if KB == 220 && defined(BETA1)
2112
vfmaddps rC13, rA1, rB3, rC13
2114
movaps 752(pB0,ldb3), rB3
2115
#elif KB == 220 && defined(BETA0)
2119
vfmaddps rC00, ra0, rB0, rC00
2121
movaps 768(pA0), rA0
2123
vfmaddps rC10, ra1, rB0, rC10
2125
movaps 768(pA0,lda), rA1
2127
vfmaddps rC01, ra0, rB1, rC01
2129
movaps 768(pB0), rB0
2130
#elif KB == 224 && defined(BETA1)
2132
vfmaddps rC11, ra1, rB1, rC11
2134
movaps 768(pB0,ldb), rB1
2135
#elif KB == 224 && defined(BETA1)
2137
vfmaddps rC02, ra0, rB2, rC02
2138
#if KB == 224 && defined(BETA1)
2140
vfmaddps rC12, ra1, rB2, rC12
2142
movaps 768(pB0,ldb,2), rB2
2143
#elif KB == 224 && defined(BETA1)
2145
vfmaddps rC03, ra0, rB3, rC03
2146
#if KB == 224 && defined(BETA1)
2148
vfmaddps rC13, ra1, rB3, rC13
2150
movaps 768(pB0,ldb3), rB3
2151
#elif KB == 224 && defined(BETA1)
2155
vfmaddps rC00, rA0, rB0, rC00
2157
movaps 784(pA0), ra0
2159
vfmaddps rC10, rA1, rB0, rC10
2161
movaps 784(pA0,lda), ra1
2163
vfmaddps rC01, rA0, rB1, rC01
2165
movaps 784(pB0), rB0
2166
#elif KB == 228 && defined(BETA1)
2168
vfmaddps rC11, rA1, rB1, rC11
2170
movaps 784(pB0,ldb), rB1
2171
#elif KB == 228 && defined(BETA1)
2173
vfmaddps rC02, rA0, rB2, rC02
2174
#if KB == 228 && defined(BETA0)
2176
vfmaddps rC12, rA1, rB2, rC12
2178
movaps 784(pB0,ldb,2), rB2
2179
#elif KB == 228 && defined(BETA1)
2181
vfmaddps rC03, rA0, rB3, rC03
2182
#if KB == 228 && defined(BETA1)
2184
vfmaddps rC13, rA1, rB3, rC13
2186
movaps 784(pB0,ldb3), rB3
2187
#elif KB == 228 && defined(BETA0)
2191
vfmaddps rC00, ra0, rB0, rC00
2193
movaps 800(pA0), rA0
2195
vfmaddps rC10, ra1, rB0, rC10
2197
movaps 800(pA0,lda), rA1
2199
vfmaddps rC01, ra0, rB1, rC01
2201
movaps 800(pB0), rB0
2202
#elif KB == 232 && defined(BETA1)
2204
vfmaddps rC11, ra1, rB1, rC11
2206
movaps 800(pB0,ldb), rB1
2207
#elif KB == 232 && defined(BETA1)
2209
vfmaddps rC02, ra0, rB2, rC02
2210
#if KB == 232 && defined(BETA1)
2212
vfmaddps rC12, ra1, rB2, rC12
2214
movaps 800(pB0,ldb,2), rB2
2215
#elif KB == 232 && defined(BETA1)
2217
vfmaddps rC03, ra0, rB3, rC03
2218
#if KB == 232 && defined(BETA1)
2220
vfmaddps rC13, ra1, rB3, rC13
2222
movaps 800(pB0,ldb3), rB3
2223
#elif KB == 232 && defined(BETA1)
2227
vfmaddps rC00, rA0, rB0, rC00
2229
movaps 816(pA0), ra0
2231
vfmaddps rC10, rA1, rB0, rC10
2233
movaps 816(pA0,lda), ra1
2235
vfmaddps rC01, rA0, rB1, rC01
2237
movaps 816(pB0), rB0
2238
#elif KB == 236 && defined(BETA1)
2240
vfmaddps rC11, rA1, rB1, rC11
2242
movaps 816(pB0,ldb), rB1
2243
#elif KB == 236 && defined(BETA1)
2245
vfmaddps rC02, rA0, rB2, rC02
2246
#if KB == 236 && defined(BETA0)
2248
vfmaddps rC12, rA1, rB2, rC12
2250
movaps 816(pB0,ldb,2), rB2
2251
#elif KB == 236 && defined(BETA1)
2253
vfmaddps rC03, rA0, rB3, rC03
2254
#if KB == 236 && defined(BETA1)
2256
vfmaddps rC13, rA1, rB3, rC13
2258
movaps 816(pB0,ldb3), rB3
2259
#elif KB == 236 && defined(BETA0)
2263
vfmaddps rC00, ra0, rB0, rC00
2265
movaps 832(pA0), rA0
2267
vfmaddps rC10, ra1, rB0, rC10
2269
movaps 832(pA0,lda), rA1
2271
vfmaddps rC01, ra0, rB1, rC01
2273
movaps 832(pB0), rB0
2274
#elif KB == 240 && defined(BETA1)
2276
vfmaddps rC11, ra1, rB1, rC11
2278
movaps 832(pB0,ldb), rB1
2279
#elif KB == 240 && defined(BETA1)
2281
vfmaddps rC02, ra0, rB2, rC02
2282
#if KB == 240 && defined(BETA1)
2284
vfmaddps rC12, ra1, rB2, rC12
2286
movaps 832(pB0,ldb,2), rB2
2287
#elif KB == 240 && defined(BETA1)
2289
vfmaddps rC03, ra0, rB3, rC03
2290
#if KB == 240 && defined(BETA1)
2292
vfmaddps rC13, ra1, rB3, rC13
2294
movaps 832(pB0,ldb3), rB3
2295
#elif KB == 240 && defined(BETA1)
2299
vfmaddps rC00, rA0, rB0, rC00
2301
movaps 848(pA0), ra0
2303
vfmaddps rC10, rA1, rB0, rC10
2305
movaps 848(pA0,lda), ra1
2307
vfmaddps rC01, rA0, rB1, rC01
2309
movaps 848(pB0), rB0
2310
#elif KB == 244 && defined(BETA1)
2312
vfmaddps rC11, rA1, rB1, rC11
2314
movaps 848(pB0,ldb), rB1
2315
#elif KB == 244 && defined(BETA1)
2317
vfmaddps rC02, rA0, rB2, rC02
2318
#if KB == 244 && defined(BETA0)
2320
vfmaddps rC12, rA1, rB2, rC12
2322
movaps 848(pB0,ldb,2), rB2
2323
#elif KB == 244 && defined(BETA1)
2325
vfmaddps rC03, rA0, rB3, rC03
2326
#if KB == 244 && defined(BETA1)
2328
vfmaddps rC13, rA1, rB3, rC13
2330
movaps 848(pB0,ldb3), rB3
2331
#elif KB == 244 && defined(BETA0)
2335
vfmaddps rC00, ra0, rB0, rC00
2337
movaps 864(pA0), rA0
2339
vfmaddps rC10, ra1, rB0, rC10
2341
movaps 864(pA0,lda), rA1
2343
vfmaddps rC01, ra0, rB1, rC01
2345
movaps 864(pB0), rB0
2346
#elif KB == 248 && defined(BETA1)
2348
vfmaddps rC11, ra1, rB1, rC11
2350
movaps 864(pB0,ldb), rB1
2351
#elif KB == 248 && defined(BETA1)
2353
vfmaddps rC02, ra0, rB2, rC02
2354
#if KB == 248 && defined(BETA1)
2356
vfmaddps rC12, ra1, rB2, rC12
2358
movaps 864(pB0,ldb,2), rB2
2359
#elif KB == 248 && defined(BETA1)
2361
vfmaddps rC03, ra0, rB3, rC03
2362
#if KB == 248 && defined(BETA1)
2364
vfmaddps rC13, ra1, rB3, rC13
2366
movaps 864(pB0,ldb3), rB3
2367
#elif KB == 248 && defined(BETA1)
2371
vfmaddps rC00, rA0, rB0, rC00
2373
movaps 880(pA0), ra0
2375
vfmaddps rC10, rA1, rB0, rC10
2377
movaps 880(pA0,lda), ra1
2379
vfmaddps rC01, rA0, rB1, rC01
2381
movaps 880(pB0), rB0
2382
#elif KB == 252 && defined(BETA1)
2384
vfmaddps rC11, rA1, rB1, rC11
2386
movaps 880(pB0,ldb), rB1
2387
#elif KB == 252 && defined(BETA1)
2389
vfmaddps rC02, rA0, rB2, rC02
2390
#if KB == 252 && defined(BETA0)
2392
vfmaddps rC12, rA1, rB2, rC12
2394
movaps 880(pB0,ldb,2), rB2
2395
#elif KB == 252 && defined(BETA1)
2397
vfmaddps rC03, rA0, rB3, rC03
2398
#if KB == 252 && defined(BETA1)
2400
vfmaddps rC13, rA1, rB3, rC13
2402
movaps 880(pB0,ldb3), rB3
2403
#elif KB == 252 && defined(BETA0)
2407
vfmaddps rC00, ra0, rB0, rC00
2409
movaps 896(pA0), rA0
2411
vfmaddps rC10, ra1, rB0, rC10
2413
movaps 896(pA0,lda), rA1
2415
vfmaddps rC01, ra0, rB1, rC01
2417
movaps 896(pB0), rB0
2418
#elif KB == 256 && defined(BETA1)
2420
vfmaddps rC11, ra1, rB1, rC11
2422
movaps 896(pB0,ldb), rB1
2423
#elif KB == 256 && defined(BETA1)
2425
vfmaddps rC02, ra0, rB2, rC02
2426
#if KB == 256 && defined(BETA1)
2428
vfmaddps rC12, ra1, rB2, rC12
2430
movaps 896(pB0,ldb,2), rB2
2431
#elif KB == 256 && defined(BETA1)
2433
vfmaddps rC03, ra0, rB3, rC03
2434
#if KB == 256 && defined(BETA1)
2436
vfmaddps rC13, ra1, rB3, rC13
2438
movaps 896(pB0,ldb3), rB3
2439
#elif KB == 256 && defined(BETA1)
2443
* Add in original C if necessary
2447
addss CMUL(4)(pC0), rC10
2448
addss (pC0,ldc), rC01
2449
addss CMUL(4)(pC0,ldc), rC11
2450
addss (pC0,ldc,2), rC02
2451
addss CMUL(4)(pC0,ldc,2), rC12
2452
addss (pC0,ldc3), rC03
2453
addss CMUL(4)(pC0,ldc3), rC13
2454
#elif defined(BETAX)
2455
movaps BETAOFF(%rsp), rA0
2457
vfmaddss ra0, (pC0), rA0, rB0
2458
vfmaddss ra0, CMUL(4)(pC0), rA0, rB1
2459
unpcklps rB1, rB0 /* rB0={0,0,c10,c00} */
2460
vfmaddss ra0, (pC0,ldc), rA0, rB2
2461
vfmaddss ra0, CMUL(4)(pC0,ldc), rA0, rB3
2462
unpcklps rB3, rB2 /* rB2={0,0,c11,c01} */
2463
movlhps rB2, rB0 /* rB0={c11,c01,c10,c00} */
2464
vfmaddss ra0, (pC0,ldc,2), rA0, rB1
2465
vfmaddss ra0, CMUL(4)(pC0,ldc,2), rA0, rB3
2466
unpcklps rB3, rB1 /* rB1={0,0,c12,c02} */
2467
vfmaddss ra0, (pC0,ldc3), rA0, rA1
2468
vfmaddss ra0, CMUL(4)(pC0,ldc3), rA0, ra1
2469
unpcklps ra1, rA1 /* rA1={0,0,c13,c03} */
2470
movlhps rA1, rB1 /* rB1={c13,c03,c12,c02} */
2473
* K-loop finished, sum up vectors
2476
haddps rC10, rC00 /* rC00 = {c10cd, c10ab, c00cd, c00ab} */
2478
haddps rC11, rC01 /* rC01 = {c11cd, c11ab, c01cd, c01ab} */
2479
haddps rC12, rC02 /* rC02 = {c12cd, c12ab, c02cd, c02ab} */
2480
haddps rC13, rC03 /* rC03 = {c13cd, c13ab, c03cd, c03ab} */
2481
haddps rC01, rC00 /* rC00 = {c11a-d, c01a-d, c10a-d, c00a-d} */
2482
haddps rC03, rC02 /* rC02 = {c13a-d, c03a-d, c12a-d, c02a-d} */
2488
/* rC00 = {c11, c01, c10, c00} */
2489
/* rC02 = {c13, c03, c12, c02} */
2491
pshufd $0xB1, rC00, rC01 /* rC01 = {c01, c11, c00, c10} */
2492
movss rC02, (pC0,ldc,2)
2493
pshufd $0xB1, rC02, rC03 /* rC03 = {c03, c13, c02, c12} */
2494
movss rC01, CMUL(4)(pC0)
2495
movhlps rC00, rC00 /* rC00 = {c11, c01, c11, c01} */
2496
movss rC03, CMUL(4)(pC0,ldc,2)
2497
movhlps rC02, rC02 /* rC02 = {c13, c03, c13, c03} */
2498
movss rC00, (pC0,ldc)
2499
movhlps rC01, rC01 /* rC01 = {c01, c11, c01, c11} */
2500
movss rC02, (pC0,ldc3)
2501
movhlps rC03, rC03 /* rC03 = {c03, c13, c03, c13} */
2502
movss rC01, CMUL(4)(pC0,ldc)
2503
movss rC03, CMUL(4)(pC0,ldc3)
2505
lea (pA0,lda,2), pA0
2514
lea (pB0, ldb, 4), pB0
2520
movq -16(%rsp), %rbx
2521
movq -24(%rsp), %r12
2522
movq -32(%rsp), %r13
2523
movq -40(%rsp), %r14
2524
movq -48(%rsp), %r15