1
#include "atlas_misc.h"
3
/*
 * ATL_dJIK0x0x0TN1x1x1_a1_b1 -- kernel variant labelled mu=1, nu=1, ku=1 in
 * the comment text below.
 * NOTE(review): only a sampled subset of this definition's lines is visible
 * in this listing: the braces, the definitions of Mb/Nb/Kb and the loop body
 * are not shown, and the bare numbers between lines look like line-number
 * residue from the original file -- confirm against the original.
 */
static void ATL_dJIK0x0x0TN1x1x1_a1_b1
4
(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
6
* matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
7
* lda=0, ldb=0, ldc=0, mu=1, nu=1, ku=1
13
/* stM is A advanced by lda*Mb elements and stN is B advanced by ldb*Nb
 * elements; presumably loop-end sentinels -- their uses are not visible here. */
const TYPE *stM = A + (lda*Mb);
14
const TYPE *stN = B + (ldb*Nb);
16
/* Pointer increments for A: incAm = lda - Kb, incAn = -(Mb*lda). */
const int incAm = ((lda) - Kb), incAn = -(Mb*lda);
18
/* Pointer increments for B: incBm = -Kb, incBn = ldb. */
const int incBm = -(Kb), incBn = (ldb);
21
/* NOTE(review): incCn is declared twice (plain here, doubled via <<1 below);
 * presumably the two sit in alternative preprocessor branches whose
 * directives are not part of this listing -- confirm. */
const int incCn = (ldc) - (Mb);
24
const int incCn = ((ldc) - (Mb))<<1;
44
/* Runs while k is non-zero, decrementing from K; the loop body is not
 * visible in this listing. */
for (k=K; k; k--) /* easy loop to unroll */
100
/*
 * ATL_dJIK0x0x0TN6x1x1_a1_b1 -- kernel variant labelled mu=6, nu=1, ku=1 in
 * the comment text below; it keeps six accumulators (rC0_0..rC5_0) for six
 * consecutive entries reached through pC0.
 * NOTE(review): only a sampled subset of this definition's lines is visible
 * in this listing (no braces, no preprocessor directives, no loop body, no
 * declarations of Nb/Kb/pC0/cc/k); the bare numbers between lines look like
 * line-number residue from the original file -- confirm against the original.
 */
static void ATL_dJIK0x0x0TN6x1x1_a1_b1
101
(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
103
* matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
104
* lda=0, ldb=0, ldc=0, mu=6, nu=1, ku=1
107
/* Largest multiple of 6 not exceeding M (for non-negative M). */
const int Mb = (M/6)*6;
110
/* Saved copies of the incoming A and B pointers; used in the trailing call. */
const TYPE *ca=A, *cb=B;
112
/* stM is A advanced by lda*Mb elements and stN is B advanced by ldb*Nb
 * elements; presumably loop-end sentinels -- their uses are not visible here. */
const TYPE *stM = A + (lda*Mb);
113
const TYPE *stN = B + (ldb*Nb);
115
/* (lda<<2)+(lda<<1) is 6*lda, so incAm = 6*lda - Kb. */
const int incAm = ((((lda) << 2)+((lda) << 1)) - Kb), incAn = -(Mb*lda);
117
const int incBm = -(Kb), incBn = (ldb);
120
/* NOTE(review): incCn is declared twice (plain here, doubled via <<1 below);
 * presumably alternative preprocessor branches whose directives are not part
 * of this listing -- confirm. */
const int incCn = (ldc) - (Mb);
123
const int incCn = ((ldc) - (Mb))<<1;
126
/* Six read pointers into A, each lda elements after the previous one. */
const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda), *pA4=pA3+(lda), *pA5=pA4+(lda);
129
register TYPE rA0, rA1, rA2, rA3, rA4, rA5;
131
register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC4_0, rC5_0;
139
/* Clears all six accumulators. */
rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC4_0 = rC5_0 = ATL_rzero;
143
/* Loads accumulators 1..5 from pC0 at unit stride (offsets 1..5); the load
 * of rC0_0 is not visible in this listing. */
rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
144
rC4_0 = pC0[4]; rC5_0 = pC0[5];
146
/* Same loads at stride 2 (offsets 2,4,..,10).
 * NOTE(review): presumably the alternative to the unit-stride loads above,
 * selected by a preprocessor branch not shown here -- confirm. */
rC1_0 = pC0[2]; rC2_0 = pC0[4]; rC3_0 = pC0[6];
147
rC4_0 = pC0[8]; rC5_0 = pC0[10];
151
/* Scales all six accumulators by rA5.
 * NOTE(review): the assignment that gives rA5 its value at this point is
 * not visible; presumably it temporarily holds the beta scale -- confirm. */
rC0_0 *= rA5; rC1_0 *= rA5; rC2_0 *= rA5;
152
rC3_0 *= rA5; rC4_0 *= rA5; rC5_0 *= rA5;
155
/* Runs while k is non-zero, decrementing from K; the loop body is not
 * visible in this listing. */
for (k=K; k; k--) /* easy loop to unroll */
180
/* Writes accumulators 1..5 back through pC0 at unit stride; the store of
 * rC0_0 is not visible in this listing. */
pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0;
181
pC0[4] = rC4_0; pC0[5] = rC5_0;
183
/* Stride-2 counterpart of the stores above (offsets 2,4,..,10). */
pC0[2] = rC1_0; pC0[4] = rC2_0; pC0[6] = rC3_0;
184
pC0[8] = rC4_0; pC0[10] = rC5_0;
208
/* Hands off to the 1x1x1 routine with k as its first (M) argument, A
 * advanced by Mb*lda and C advanced by (Mb SHIFT); SHIFT is a macro defined
 * outside this listing.
 * NOTE(review): presumably this handles the rows left over after the 6-row
 * blocks; the statement that sets k and guards this call is not visible. */
ATL_dJIK0x0x0TN1x1x1_a1_b1(k, N, K, alpha, ca + (Mb*lda), lda, cb, ldb, beta, cc + (Mb SHIFT), ldc);
246
/*
 * ATL_dJIK0x0x0TN1x8x1_a1_b1 -- kernel variant labelled mu=1, nu=8, ku=1 in
 * the comment text below; it keeps eight accumulators (rC0_0..rC0_7), one per
 * C pointer pC0..pC7.
 * NOTE(review): only a sampled subset of this definition's lines is visible
 * in this listing (no braces, no preprocessor directives, no loop body, no
 * declarations of Mb/Kb/cc/k); the bare numbers between lines look like
 * line-number residue from the original file -- confirm against the original.
 */
static void ATL_dJIK0x0x0TN1x8x1_a1_b1
247
(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
249
* matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
250
* lda=0, ldb=0, ldc=0, mu=1, nu=8, ku=1
254
/* Largest multiple of 8 not exceeding N (for non-negative N). */
const int Nb = (N>>3)<<3;
256
/* Saved copies of the incoming A and B pointers; used in the trailing call. */
const TYPE *ca=A, *cb=B;
258
/* stM is A advanced by lda*Mb elements and stN is B advanced by ldb*Nb
 * elements; presumably loop-end sentinels -- their uses are not visible here. */
const TYPE *stM = A + (lda*Mb);
259
const TYPE *stN = B + (ldb*Nb);
261
const int incAm = ((lda) - Kb), incAn = -(Mb*lda);
263
/* ldb<<3 is 8*ldb, so incBn advances B by eight leading-dimension steps. */
const int incBm = -(Kb), incBn = (((ldb) << 3));
267
/* NOTE(review): incCn is declared twice (8*ldc - Mb here, doubled below);
 * presumably alternative preprocessor branches whose directives are not part
 * of this listing. Among the visible lines ldc2 (= 2*ldc) is declared only
 * in the second variant -- confirm how it is provided for the first. */
const int incCn = (((ldc) << 3)) - (Mb);
270
const int incCn = ((((ldc) << 3)) - (Mb))<<1, ldc2 = ldc<<1;
272
/* Eight write pointers into C, each ldc2 elements after the previous one. */
TYPE *pC0=C, *pC1=pC0+(ldc2), *pC2=pC1+(ldc2), *pC3=pC2+(ldc2),
273
*pC4=pC3+(ldc2), *pC5=pC4+(ldc2), *pC6=pC5+(ldc2), *pC7=pC6+(ldc2);
275
/* Eight read pointers into B, each ldb elements after the previous one. */
const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb), *pB4=pB3+(ldb), *pB5=pB4+(ldb), *pB6=pB5+(ldb), *pB7=pB6+(ldb);
278
register TYPE rB0, rB1, rB2, rB3, rB4, rB5, rB6, rB7;
279
register TYPE rC0_0, rC0_1, rC0_2, rC0_3, rC0_4, rC0_5, rC0_6, rC0_7;
287
/* Clears all eight accumulators. */
rC0_0 = rC0_1 = rC0_2 = rC0_3 = rC0_4 = rC0_5 =
288
rC0_6 = rC0_7 = ATL_rzero;
290
/* Loads each accumulator from the element its C pointer currently targets. */
rC0_0 = *pC0; rC0_1 = *pC1; rC0_2 = *pC2;
291
rC0_3 = *pC3; rC0_4 = *pC4; rC0_5 = *pC5;
292
rC0_6 = *pC6; rC0_7 = *pC7;
295
/* Scales all eight accumulators by rB7.
 * NOTE(review): the assignment that gives rB7 its value at this point is
 * not visible; presumably it temporarily holds the beta scale -- confirm. */
rC0_0 *= rB7; rC0_1 *= rB7; rC0_2 *= rB7; rC0_3 *= rB7;
296
rC0_4 *= rB7; rC0_5 *= rB7; rC0_6 *= rB7; rC0_7 *= rB7;
299
/* Runs while k is non-zero, decrementing from K; the loop body is not
 * visible in this listing. */
for (k=K; k; k--) /* easy loop to unroll */
376
/* Hands off to the 1x1x1 routine with k as its second (N) argument, B
 * advanced by Nb*ldb and C advanced by Nb*ldc2.
 * NOTE(review): presumably this handles the columns left over after the
 * 8-column blocks; the statement that sets k and guards this call is not
 * visible. */
ATL_dJIK0x0x0TN1x1x1_a1_b1(M, k, K, alpha, ca, lda, cb + (Nb*ldb), ldb, beta, cc + (Nb*ldc2), ldc);
418
(const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
420
* matmul with TA=T, TB=N, MB=0, NB=0, KB=0,
421
* lda=0, ldb=0, ldc=0, mu=6, nu=8, ku=2
424
const int Mb = (M/6)*6;
425
const int Nb = (N>>3)<<3;
426
const int Kb = (K>>1)<<1;
428
const int Kstart = (K-1)>>1;
430
const int Kstart = (K>>1) - 1;
432
const TYPE *stM = A + (lda*Mb);
433
const TYPE *stN = B + (ldb*Nb), *ca=A, *cb=B;
436
const int incAm = ((((lda) << 2)+((lda) << 1)) - Kb), incAn = -(Mb*lda);
438
const int incBm = -(Kb), incBn = (((ldb) << 3));
442
const int incCn = (((ldc) << 3)) - (Mb);
445
const int incCn = ((((ldc) << 3)) - (Mb))<<1, ldc2=ldc<<1;
447
TYPE *pC0=C, *pC1=pC0+(ldc2), *pC2=pC1+(ldc2), *pC3=pC2+(ldc2),
448
*pC4=pC3+(ldc2), *pC5=pC4+(ldc2), *pC6=pC5+(ldc2), *pC7=pC6+(ldc2);
449
const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda),
450
*pA4=pA3+(lda), *pA5=pA4+(lda);
451
const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb),
452
*pB4=pB3+(ldb), *pB5=pB4+(ldb), *pB6=pB5+(ldb), *pB7=pB6+(ldb);
455
TYPE *bp = (TYPE *) &beta;
457
register TYPE rA0, rA1, rA2, rA3, rA4, rA5;
458
register TYPE ra0, ra1, ra2, ra3, ra4, ra5;
459
register TYPE rB0, rB1, rB2, rB3, rB4, rB5, rB6, rB7;
460
register TYPE rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7;
461
register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC4_0, rC5_0,
462
rC0_1, rC1_1, rC2_1, rC3_1, rC4_1, rC5_1,
463
rC0_2, rC1_2, rC2_2, rC3_2, rC4_2, rC5_2,
464
rC0_3, rC1_3, rC2_3, rC3_3, rC4_3, rC5_3,
465
rC0_4, rC1_4, rC2_4, rC3_4, rC4_4, rC5_4,
466
rC0_5, rC1_5, rC2_5, rC3_5, rC4_5, rC5_5,
467
rC0_6, rC1_6, rC2_6, rC3_6, rC4_6, rC5_6,
468
rC0_7, rC1_7, rC2_7, rC3_7, rC4_7, rC5_7;
469
if (pA0 != stM && pB0 != stN)
476
rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC4_0 = rC5_0 =
477
rC0_1 = rC1_1 = rC2_1 = rC3_1 = rC4_1 = rC5_1 =
478
rC0_2 = rC1_2 = rC2_2 = rC3_2 = rC4_2 = rC5_2 =
479
rC0_3 = rC1_3 = rC2_3 = rC3_3 = rC4_3 = rC5_3 =
480
rC0_4 = rC1_4 = rC2_4 = rC3_4 = rC4_4 = rC5_4 =
481
rC0_5 = rC1_5 = rC2_5 = rC3_5 = rC4_5 = rC5_5 =
482
rC0_6 = rC1_6 = rC2_6 = rC3_6 = rC4_6 = rC5_6 =
483
rC0_7 = rC1_7 = rC2_7 = rC3_7 = rC4_7 = rC5_7 = ATL_rzero;
486
rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[ 2];
487
rC3_0 = pC0[3]; rC4_0 = pC0[4]; rC5_0 = pC0[ 5];
488
rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[ 2];
489
rC3_1 = pC1[3]; rC4_1 = pC1[4]; rC5_1 = pC1[ 5];
490
rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[ 2];
491
rC3_2 = pC2[3]; rC4_2 = pC2[4]; rC5_2 = pC2[ 5];
492
rC0_3 = *pC3; rC1_3 = pC3[1]; rC2_3 = pC3[ 2];
493
rC3_3 = pC3[3]; rC4_3 = pC3[4]; rC5_3 = pC3[ 5];
494
rC0_4 = *pC4; rC1_4 = pC4[1]; rC2_4 = pC4[ 2];
495
rC3_4 = pC4[3]; rC4_4 = pC4[4]; rC5_4 = pC4[ 5];
496
rC0_5 = *pC5; rC1_5 = pC5[1]; rC2_5 = pC5[ 2];
497
rC3_5 = pC5[3]; rC4_5 = pC5[4]; rC5_5 = pC5[ 5];
498
rC0_6 = *pC6; rC1_6 = pC6[1]; rC2_6 = pC6[ 2];
499
rC3_6 = pC6[3]; rC4_6 = pC6[4]; rC5_6 = pC6[ 5];
500
rC0_7 = *pC7; rC1_7 = pC7[1]; rC2_7 = pC7[ 2];
501
rC3_7 = pC7[3]; rC4_7 = pC7[4]; rC5_7 = pC7[ 5];
503
rC0_0 = *pC0; rC1_0 = pC0[2]; rC2_0 = pC0[ 4];
504
rC3_0 = pC0[6]; rC4_0 = pC0[8]; rC5_0 = pC0[10];
505
rC0_1 = *pC1; rC1_1 = pC1[2]; rC2_1 = pC1[ 4];
506
rC3_1 = pC1[6]; rC4_1 = pC1[8]; rC5_1 = pC1[10];
507
rC0_2 = *pC2; rC1_2 = pC2[2]; rC2_2 = pC2[ 4];
508
rC3_2 = pC2[6]; rC4_2 = pC2[8]; rC5_2 = pC2[10];
509
rC0_3 = *pC3; rC1_3 = pC3[2]; rC2_3 = pC3[ 4];
510
rC3_3 = pC3[6]; rC4_3 = pC3[8]; rC5_3 = pC3[10];
511
rC0_4 = *pC4; rC1_4 = pC4[2]; rC2_4 = pC4[ 4];
512
rC3_4 = pC4[6]; rC4_4 = pC4[8]; rC5_4 = pC4[10];
513
rC0_5 = *pC5; rC1_5 = pC5[2]; rC2_5 = pC5[ 4];
514
rC3_5 = pC5[6]; rC4_5 = pC5[8]; rC5_5 = pC5[10];
515
rC0_6 = *pC6; rC1_6 = pC6[2]; rC2_6 = pC6[ 4];
516
rC3_6 = pC6[6]; rC4_6 = pC6[8]; rC5_6 = pC6[10];
517
rC0_7 = *pC7; rC1_7 = pC7[2]; rC2_7 = pC7[ 4];
518
rC3_7 = pC7[6]; rC4_7 = pC7[8]; rC5_7 = pC7[10];
522
rC0_0 *= rb7; rC1_0 *= rb7; rC2_0 *= rb7;
523
rC3_0 *= rb7; rC4_0 *= rb7; rC5_0 *= rb7;
524
rC0_1 *= rb7; rC1_1 *= rb7; rC2_1 *= rb7;
525
rC3_1 *= rb7; rC4_1 *= rb7; rC5_1 *= rb7;
526
rC0_2 *= rb7; rC1_2 *= rb7; rC2_2 *= rb7;
527
rC3_2 *= rb7; rC4_2 *= rb7; rC5_2 *= rb7;
528
rC0_3 *= rb7; rC1_3 *= rb7; rC2_3 *= rb7;
529
rC3_3 *= rb7; rC4_3 *= rb7; rC5_3 *= rb7;
530
rC0_4 *= rb7; rC1_4 *= rb7; rC2_4 *= rb7;
531
rC3_4 *= rb7; rC4_4 *= rb7; rC5_4 *= rb7;
532
rC0_5 *= rb7; rC1_5 *= rb7; rC2_5 *= rb7;
533
rC3_5 *= rb7; rC4_5 *= rb7; rC5_5 *= rb7;
534
rC0_6 *= rb7; rC1_6 *= rb7; rC2_6 *= rb7;
535
rC3_6 *= rb7; rC4_6 *= rb7; rC5_6 *= rb7;
536
rC0_7 *= rb7; rC1_7 *= rb7; rC2_7 *= rb7;
537
rC3_7 *= rb7; rC4_7 *= rb7; rC5_7 *= rb7;
540
rA0 = *pA0++; rA1 = *pA1++; rA2 = *pA2++; rA3 = *pA3++;
541
rA4 = *pA4++; rA5 = *pA5++;
542
rB0 = *pB0++; rB1 = *pB1++; rB2 = *pB2++; rB3 = *pB3++;
543
rB4 = *pB4++; rB5 = *pB5++; rB6 = *pB6++; rB7 = *pB7++;
544
for (k=Kstart; k; k--) /* easy loop to unroll */
550
rC4_0 += rA4 * rB0; rb0 = *pB0++;
553
rC1_1 += rA1 * rB1; ra0 = *pA0++;
556
rC4_1 += rA4 * rB1; ra1 = *pA1++;
559
rC1_2 += rA1 * rB2; ra2 = *pA2++;
562
rC4_2 += rA4 * rB2; ra3 = *pA3++;
565
rC1_3 += rA1 * rB3; ra4 = *pA4++;
568
rC4_3 += rA4 * rB3; ra5 = *pA5++;
571
rC1_4 += rA1 * rB4; rb1 = *pB1++;
574
rC4_4 += rA4 * rB4; rb2 = *pB2++;
577
rC1_5 += rA1 * rB5; rb3 = *pB3++;
580
rC4_5 += rA4 * rB5; rb4 = *pB4++;
583
rC1_6 += rA1 * rB6; rb5 = *pB5++;
586
rC4_6 += rA4 * rB6; rb6 = *pB6++;
589
rC1_7 += rA1 * rB7; rb7 = *pB7++;
592
rC4_7 += rA4 * rB7; rB0 = *pB0++;
596
rC1_0 += ra1 * rb0; rA0 = *pA0++;
599
rC4_0 += ra4 * rb0; rA1 = *pA1++;
602
rC1_1 += ra1 * rb1; rA2 = *pA2++;
605
rC4_1 += ra4 * rb1; rA3 = *pA3++;
608
rC1_2 += ra1 * rb2; rA4 = *pA4++;
611
rC4_2 += ra4 * rb2; rA5 = *pA5++;
614
rC1_3 += ra1 * rb3; rB1 = *pB1++;
617
rC4_3 += ra4 * rb3; rB2 = *pB2++;
620
rC1_4 += ra1 * rb4; rB3 = *pB3++;
623
rC4_4 += ra4 * rb4; rB4 = *pB4++;
626
rC1_5 += ra1 * rb5; rB5 = *pB5++;
629
rC4_5 += ra4 * rb5; rB6 = *pB6++;
633
rC2_6 += ra2 * rb6; rB7 = *pB7++;
649
rC1_0 += rA1 * rB0; pA0--;
651
rC3_0 += rA3 * rB0; pA1--;
653
rC5_0 += rA5 * rB0; pA2--;
655
rC1_1 += rA1 * rB1; pA3--;
657
rC3_1 += rA3 * rB1; pA4--;
659
rC5_1 += rA5 * rB1; pA5--;
661
rC1_2 += rA1 * rB2; pB0--;
663
rC3_2 += rA3 * rB2; pB1--;
665
rC5_2 += rA5 * rB2; pB2--;
667
rC1_3 += rA1 * rB3; pB3--;
669
rC3_3 += rA3 * rB3; pB4--;
671
rC5_3 += rA5 * rB3; pB5--;
673
rC1_4 += rA1 * rB4; pB6--;
675
rC3_4 += rA3 * rB4; pB7--;
704
rC4_0 += rA4 * rB0; rb0 = *pB0++;
707
rC1_1 += rA1 * rB1; ra0 = *pA0++;
710
rC4_1 += rA4 * rB1; ra1 = *pA1++;
713
rC1_2 += rA1 * rB2; ra2 = *pA2++;
716
rC4_2 += rA4 * rB2; ra3 = *pA3++;
719
rC1_3 += rA1 * rB3; ra4 = *pA4++;
722
rC4_3 += rA4 * rB3; ra5 = *pA5++;
725
rC1_4 += rA1 * rB4; rb1 = *pB1++;
728
rC4_4 += rA4 * rB4; rb2 = *pB2++;
731
rC1_5 += rA1 * rB5; rb3 = *pB3++;
734
rC4_5 += rA4 * rB5; rb4 = *pB4++;
737
rC1_6 += rA1 * rB6; rb5 = *pB5++;
740
rC4_6 += rA4 * rB6; rb6 = *pB6++;
743
rC1_7 += rA1 * rB7; rb7 = *pB7++;
802
*pC0 = rC0_0; pC0[1] = rC1_0; pC0[ 2] = rC2_0;
803
pC0[3] = rC3_0; pC0[4] = rC4_0; pC0[ 5] = rC5_0;
804
*pC1 = rC0_1; pC1[1] = rC1_1; pC1[ 2] = rC2_1;
805
pC1[3] = rC3_1; pC1[4] = rC4_1; pC1[ 5] = rC5_1;
806
*pC2 = rC0_2; pC2[1] = rC1_2; pC2[ 2] = rC2_2;
807
pC2[3] = rC3_2; pC2[4] = rC4_2; pC2[ 5] = rC5_2;
808
*pC3 = rC0_3; pC3[1] = rC1_3; pC3[ 2] = rC2_3;
809
pC3[3] = rC3_3; pC3[4] = rC4_3; pC3[ 5] = rC5_3;
810
*pC4 = rC0_4; pC4[1] = rC1_4; pC4[ 2] = rC2_4;
811
pC4[3] = rC3_4; pC4[4] = rC4_4; pC4[ 5] = rC5_4;
812
*pC5 = rC0_5; pC5[1] = rC1_5; pC5[ 2] = rC2_5;
813
pC5[3] = rC3_5; pC5[4] = rC4_5; pC5[ 5] = rC5_5;
814
*pC6 = rC0_6; pC6[1] = rC1_6; pC6[ 2] = rC2_6;
815
pC6[3] = rC3_6; pC6[4] = rC4_6; pC6[ 5] = rC5_6;
816
*pC7 = rC0_7; pC7[1] = rC1_7; pC7[ 2] = rC2_7;
817
pC7[3] = rC3_7; pC7[4] = rC4_7; pC7[ 5] = rC5_7;
819
*pC0 = rC0_0; pC0[2] = rC1_0; pC0[ 4] = rC2_0;
820
pC0[6] = rC3_0; pC0[8] = rC4_0; pC0[10] = rC5_0;
821
*pC1 = rC0_1; pC1[2] = rC1_1; pC1[ 4] = rC2_1;
822
pC1[6] = rC3_1; pC1[8] = rC4_1; pC1[10] = rC5_1;
823
*pC2 = rC0_2; pC2[2] = rC1_2; pC2[ 4] = rC2_2;
824
pC2[6] = rC3_2; pC2[8] = rC4_2; pC2[10] = rC5_2;
825
*pC3 = rC0_3; pC3[2] = rC1_3; pC3[ 4] = rC2_3;
826
pC3[6] = rC3_3; pC3[8] = rC4_3; pC3[10] = rC5_3;
827
*pC4 = rC0_4; pC4[2] = rC1_4; pC4[ 4] = rC2_4;
828
pC4[6] = rC3_4; pC4[8] = rC4_4; pC4[10] = rC5_4;
829
*pC5 = rC0_5; pC5[2] = rC1_5; pC5[ 4] = rC2_5;
830
pC5[6] = rC3_5; pC5[8] = rC4_5; pC5[10] = rC5_5;
831
*pC6 = rC0_6; pC6[2] = rC1_6; pC6[ 4] = rC2_6;
832
pC6[6] = rC3_6; pC6[8] = rC4_6; pC6[10] = rC5_6;
833
*pC7 = rC0_7; pC7[2] = rC1_7; pC7[ 4] = rC2_7;
834
pC7[6] = rC3_7; pC7[8] = rC4_7; pC7[10] = rC5_7;
836
pC0 += incCm; pC1 += incCm; pC2 += incCm; pC3 += incCm;
837
pC4 += incCm; pC5 += incCm; pC6 += incCm; pC7 += incCm;
838
pA0 += incAm; pA1 += incAm; pA2 += incAm; pA3 += incAm;
839
pA4 += incAm; pA5 += incAm;
840
pB0 += incBm; pB1 += incBm; pB2 += incBm; pB3 += incBm;
841
pB4 += incBm; pB5 += incBm; pB6 += incBm; pB7 += incBm;
870
ATL_dJIK0x0x0TN6x1x1_a1_b1(M, k, K, alpha, ca, lda, cb + (Nb*ldb), ldb,
871
beta, cc + (Nb*ldc2), ldc);
873
ATL_dJIK0x0x0TN1x8x1_a1_b1(k, Nb, K, alpha, ca + (Mb*lda), lda, cb, ldb, beta, cc + (Mb SHIFT), ldc);