/*
 * Automatically Tuned Linear Algebra Software v3.10.1
 * Copyright (C) 2011 Md. Majedul Haque Sujon
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions, and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the ATLAS group or the names of its contributors may
 *    not be used to endorse or promote products derived from this
 *    software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#error "This routine requires GAS/ARM assembly"

#error "This routine requires an ARM NEON SIMD unit!"

#error "This NEON routine requires turning off IEEE compliance!"
/* Written and Submitted by Md Majedul Haque Sujon */
/* Unroll and Scalar expansion:
 * I have tried the following unroll factors (MxN): 4x2, 4x4, 8x2, 8x4, 8x6.
 * I found the best performance with 8x4 (better than 8x6). In the case of
 * 8x6, I have to reuse FP registers, which reduces the scalar expansion
 * and increases the data dependency.
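 *
 * A rough C sketch of the 8x4 scheme (my illustration only; in the asm each
 * accumulator below is really a 4-lane q register, and the BETA=1 path is
 * shown):
 *
 *   for (int j = 0; j + 4 <= N; j += 4) {      // 4 columns per pass
 *      float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
 *      const float *a0 = A + j*lda;
 *      for (int i = 0; i + 8 <= M; i += 8)     // 8 rows per pass
 *         for (int k = i; k < i+8; k++) {      // fully unrolled in the asm
 *            s0 += a0[k        ] * X[k];
 *            s1 += a0[k +   lda] * X[k];
 *            s2 += a0[k + 2*lda] * X[k];
 *            s3 += a0[k + 3*lda] * X[k];
 *         }
 *      Y[j] += s0; Y[j+1] += s1; Y[j+2] += s2; Y[j+3] += s3;
 *   }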
 *
 * Prefetch:
 * I have tried prefetching with offsets 128, 64, 32 and 16. Surprisingly,
 * I found little variation in performance with this parameter.
 *
 * Splitting cleanup into 2 cases:
 * I have split the cleanup into two separate cases, TALL and FAT, and
 * provide a separate implementation for each.
 *
 * (diagram: the M x N operand split into the 8x4-unrolled main block plus
 * a TALL strip for the leftover N and a FAT strip for the leftover M)
 *
 * For cleanup, I first call the TALL case and then the FAT case, which I
 * believe gives better performance. When lda is not much larger than M,
 * prefetch ...
 *
 * For the remaining N < 4, the TALL case executes. To get better
 * performance, I want to use vector operations as much as possible, so I
 * implement 3 blocks: N==3, N==2 and N==1. Each block uses M unrolled by
 * 8 (the remaining M%8 elements fall into a scalar loop); a sketch follows.
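 *
 * A C sketch of the TALL N==1 block (my illustration; BETA=1 path):
 *
 *   float s = 0.0f;
 *   int i;
 *   for (i = 0; i + 8 <= M; i += 8)   // vectorized, unrolled by 8
 *      for (int k = 0; k < 8; k++)
 *         s += A[i+k] * X[i+k];
 *   for ( ; i < M; i++)               // scalar loop for the M%8 leftover
 *      s += A[i] * X[i];
 *   Y[0] += s;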
 *
 * For the remaining M < 8, the program executes the FAT cases. I use a
 * saxpy-like implementation (loading X in the outer loop, and loading A
 * and Y in the inner loop), which gives decent performance compared with
 * the scalar implementation.
 * Still, I want to maximize the use of vector ops, so I implemented 3
 * blocks: M==4, M==2 and M==1 (where M==4 uses an unroll of 4 and M==2
 * an unroll of 2). Any case with 1 <= M < 8 executes as a combination:
 * 7 = 4+2+1, 6 = 4+2, 5 = 4+1, ... (see the sketch below).
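 *
 * A C sketch of the FAT M==4 block (my illustration; BETA=1 path): the 4
 * X values are loaded once, and each Y element is then updated with a
 * 4-term dot product, saxpy-style:
 *
 *   float x0 = X[0], x1 = X[1], x2 = X[2], x3 = X[3];
 *   for (int j = 0; j < N; j++) {
 *      const float *a = A + j*lda;
 *      Y[j] += a[0]*x0 + a[1]*x1 + a[2]*x2 + a[3]*x3;
 *   }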
 *
 * Memory alignment:
 * Here, A is the main bottleneck, so aligning X or Y would not help. I
 * believe that striding the access to A (by lda*4 elements) and aligning
 * each stride of A would improve performance. I have implemented code for
 * the strided access but could not complete the cleanup part.
 *
 * (diagram: A accessed in strides, with A, A2, A3 pointing into
 * successive lda*4-element strides)
 *
 * I will incorporate the memory-alignment optimization into this code.
 *
 * Precision:
 * I found a strange problem with the vector operations. In my TALL case,
 * when M is very large and N is small (1, 2 or 3), there is a floating-
 * point precision problem: the ATLAS tester fails with diff = 0.000001 ~
 * 0.000005. I have checked my code for errors but failed to find any. If
 * I am right, there may be 2 sources: a) using the VMLA instruction;
 * b) using a vector register as a scalar (like d1[0]) to store and load.
 * I want to use VFMA (Vector Fused Multiply Accumulate), which would give
 * better precision, but the tested hardware doesn't support it. Reference:
 * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489c/CIHEJBIE.html
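 *
 * (Illustration, not kernel code: VMLA rounds the product before adding,
 * while VFMA keeps the product exact until the final rounding. In C, with
 * a = 1 + 2^-12 and fmaf() from <math.h>:
 *
 *    float a = 1.0f + 0x1p-12f;
 *    float unfused = a*a - 1.0f;         // product rounded first: 0x1p-11f
 *    float fused   = fmaf(a, a, -1.0f);  // exact: 0x1p-11f + 0x1p-24f
 *
 * the results differ by 2^-24; per-operation errors of this size can
 * accumulate into diffs of the order the tester reports.)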
 *
 * Please let me know if you find any problem in my code.
 * RCW: caused by the non-IEEE-compliant arithmetic of NEON.
 *
 * Md Majedul Haque Sujon.
 */
/* mainly used for indexing; used for other purposes if needed */

/* used mainly to check the X or Y address */

/* mainly used for jump locations; used for
 * other purposes if regs are not available
 */
/*
 * void ATL_UGEMV(const int M, const int N, const float *A, const int lda,
 *                const float *X, float *Y)
 *                 1st overflow    2nd overflow arg (passed on the stack)
 */
.globl ATL_asmdecor(ATL_UGEMV)
ATL_asmdecor(ATL_UGEMV):
.type ATL_asmdecor(ATL_UGEMV), %function
/* stmDB SP!, {r4-r11,r14} */

#################################
/* just for a test; omitted */
#################################
//push {lastX}  // need to add 4 to FSIZE_X, FSIZE_Y

//and II, II, lastX     /* zero exception bits */
//bic II, II, #(1<<24)  /* turn off flush-to-zero */

##############################
/* calculate the end-of-X and end-of-Y addresses */
add lastX, X, M, LSL #2
add lastY, Y, N, LSL #2

/* calculate the addresses of the A columns */
add A2, A, lda, LSL #2
add A3, A2, lda, LSL #2
add A4, A3, lda, LSL #2
/* set jump location for the TALL case */

/* N < 4? goto TALL case directly */

/* flag: this is not only a FAT case; needed to track BETA0 */
EOR JTARGCY, JTARGCY, JTARGCY /* JTARGCY = 0 */

/* M < 8? goto FAT case directly */
#################################################
/* M >= 8 and N >= 4 */
#################################################
/* the unrolled part of M: (M/8)*8 */

/* CHKXY holds the X address up to the remainder */
add CHKXY, X, II, LSL #2

/* here JTARGCY is used as the end of the rounded Y address */
mov JTARGCY, N, LSR #2
LSL JTARGCY, JTARGCY, #2
add JTARGCY, Y, JTARGCY, LSL #2
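/* i.e. JTARGCY = Y + ((N/4)*4)*4 bytes, the end of the part of Y covered
 * by the 4-column unrolled loop; e.g. N=19 gives ((19/4)*4)*4 = 64 bytes */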
/* calculate the distance to the next set of A columns */

/* no integer register is available, but A needs to be saved;
 * otherwise we would need a complex calculation with multiplication,
 * so I saved it on the stack. ....
 */

/* To omit the EOR in each N-loop iteration, I peeled one iteration out and
 * multiply and save the result into these regs. But the effect is so
 * negligible that it doesn't improve performance, so for simplicity I
 * dropped that code here. */
/* load X and all 4 columns of A */
VLD1.32 {d0,d1,d2,d3}, [X]!      /* q0, q1 */
VLD1.32 {d4,d5,d6,d7}, [A]!      /* q2, q3 */
VLD1.32 {d8,d9,d10,d11}, [A2]!   /* q4, q5 */
VLD1.32 {d12,d13,d14,d15}, [A3]! /* q6, q7 */
VLD1.32 {d16,d17,d18,d19}, [A4]! /* q8, q9 */
/* horizontal pairwise adds to reduce the scalar-expansion accumulators */
VPADD.F32 d28, d20, d21
VPADD.F32 d29, d22, d23
VPADD.F32 d30, d24, d25
VPADD.F32 d31, d26, d27

VPADD.F32 d20, d28, d29
VPADD.F32 d21, d30, d31
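/* VPADD dd,dn,dm gives dd[0]=dn[0]+dn[1], dd[1]=dm[0]+dm[1]; so after the
 * two levels above, d20/d21 hold the full lane sums of the accumulators
 * q10..q13, i.e. the four dot products destined for Y[0..3] */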
#ifdef BETA0
VST1.32 {d20-d21}, [Y]!
#else
/* BETA=1: accumulate into Y */
VLD1.32 {d22-d23}, [Y]
VADD.F32 q10, q10, q11
VST1.32 {d20-d21}, [Y]!
#endif

/* add (lda-M)*4 bytes to the A pointers; that is the effective distance */
#############################################################
/* Now there are two remaining cases, TALL and FAT;
 * I call the TALL case first.
 *
 * (diagram: the same TALL/FAT split as pictured at the top of the file)
 */

/* what happens to the registers: */
/* Now, X == init X + ((M/8)*8)*4, Y == init Y + ((N/4)*4)*4
 * lda is changed; it needs to be restored
 * A = changed, points to the next position
 */
/* CALL the TALL case: set parameters */
/* restore lda: lda = (lda + II*4) / 16 */
add lda, lda, II, LSL #2

/* X and Y are already set; M stays the same; N must be set to the remainder */
subs N, lastY, JTARGCY

/* set A4 to the original A: A4 is unused here and we may need A in the FAT case */

/* parameters for the TALL call: */
/* Now, X == init X + ((M/8)*8)*4, Y == init Y + ((N/4)*4)*4
 * lastX, lastY = unchanged
 * M, lda = original values
 */
ldr JTARGCY, =FAT_REM
BNE N_LESS_4 /* N != 0, so goto TALL case */
/* CALL the FAT case: */

/* first check whether there is any FAT work; if not, there is no need to
 * arrange the parameters */

/* M is unchanged from the previous operation in TALL */
subs M, M, II, LSL #3 /* M = remainder of M; sets flags to check below */

BEQ DONE /* M == 0: no FAT case, goto DONE */

/* arrange parameters:
 * X == lastX or init X + ((M/8)*8)*4, depending on the previous call!
 * lastX, lastY = unchanged
 * A = undefined, depends on the condition, so it must be saved beforehand;
 * A4 can be used to save A, as it is not used in the TALL case
 */
/* parameters for the FAT case should be:
 * X = initial X + ((M/8)*8)*4
 * A, A2, A3, A4 = follow X
 * lastX = original, not changed
 * lastY = Y + ((N/4)*4)*4
 * N = (original N/4)*4
 * BETA0 needs handling
 */
ldr X, [SP, #FSIZE_X] /* reload X, since X is undefined here */
ldr Y, [SP, #FSIZE_Y] /* reload Y, since Y and N were changed */
add X, X, II, LSL #5 /* X = X + II*32 */
add A, A, II, LSL #5 /* A = A + II*32 */

add A2, A, lda, LSL #2
add A3, A2, lda, LSL #2
add A4, A3, lda, LSL #2

sub II, lastY, Y  /* II = original N * 4 */
mov N, II, LSR #4 /* N = original N / 4  */
LSL N, N, #2      /* N = (N/4)*4          */
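/* e.g. original N = 19: II = 76 bytes, 76>>4 = 4, 4<<2 = 16, so the FAT
 * loops cover the first (N/4)*4 = 16 columns of Y */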
add lastY, Y, N, LSL #2

/* set the flag so the FAT cases avoid the store-without-load BETA0 path */

###############################################################
/* ldmIA can be used instead of pop */
/* ldmIA SP!, {r4-r11,r14} */

###############################################################
/* Special case: M >= 8 (M is a multiple of 8 at this point), N < 4 */

/* each case is handled separately: N = 3, 2, 1 */

###############################################################
/* A4 is not used in this case; I reuse it between the two calls */

/* M < 8? goto scalar block */

/* CHKXY holds the X address up to the remainder */
add CHKXY, X, II, LSL #2

/* N < 3? goto N <= 2 test */
######################################
/* N == 3 case */
######################################

/* assuming lda, A, A2, A3 are in the correct positions */
/* lastX = last element */

/* load X and all 3 columns of A */
VLD1.32 {d0,d1,d2,d3}, [X]!      /* q0, q1 */
VLD1.32 {d16,d17,d18,d19}, [A]!  /* q8, q9 */
VLD1.32 {d20,d21,d22,d23}, [A2]! /* q10, q11 */
VLD1.32 {d24,d25,d26,d27}, [A3]! /* q12, q13 */
/* horizontal pairwise adds to add up the scalar expansion */
VPADD.F32 d20, d28, d29
VPADD.F32 d21, d30, d31
VPADD.F32 d23, d0, d1

VPADD.F32 d25, d20, d21
VPADD.F32 d26, d23, d24 /* d24 is garbage; it is not stored to Y */

VST1.32 {d26[0]}, [Y]!

VLD1.32 {d29[0]}, [Y]!

VADD.F32 d28, d28, d25

sub Y, Y, #12 /* restore the previous value */

VST1.32 {d29[0]}, [Y]!
/* set Y to its initial position */

/* set the effective lda to move through A in the loop */
sub II, M, II /* number of remaining elements */

/* call cleanup to complete */
###################################################
/* N == 2 case */
###################################################

/* assuming X, A, A2 are in the correct positions */

/* load X and both columns of A */
VLD1.32 {d0,d1,d2,d3}, [X]!      /* q0, q1 */
VLD1.32 {d18,d19,d20,d21}, [A]!  /* q9, q10 */
VLD1.32 {d22,d23,d24,d25}, [A2]! /* q11, q12 */
/* horizontal pairwise add using VPADD */
/* horizontal add, rd = d18 */

VPADD.F32 d17, d14, d15
VPADD.F32 d16, d12, d13
VPADD.F32 d18, d16, d17

VADD.F32 d18, d18, d4
/* set Y to its initial position */

/* number of remaining elements */

######################################################
/* N == 1 case */
######################################################
/* assuming X and A are in the correct positions */

/* load X and the single column of A */
VLD1.32 {d0,d1,d2,d3}, [X]!     /* q0, q1 */
VLD1.32 {d18,d19,d20,d21}, [A]! /* q9, q10 */

VPADD.F32 d16, d14, d13 /* d13 is garbage but is not stored to Y */

VST1.32 {d16[0]}, [Y]!

VADD.F32 d18, d16, d17

VST1.32 {d18[0]}, [Y]!
/* set Y to its initial position */
sub II, M, II /* number of remaining elements */

/* for BETA0, there may be 2 cases:
 * 1. fall-through from the cases above: Y must be loaded even for BETA0;
 * 2. the case runs for the first time: no load of Y.
 */

/* to reduce extra condition checks, and for simplicity,
 * I separated out these two cases
 */
/* II is the remainder for X */

/* use A2 to precalculate the lda distance */

VLD1.32 {d1[0]}, [A]!

/* no need to use the BETA0 macro, as this is a cleanup case */
###############################################
/* M < 8 and N < 4: the M_N scalar loop */
/* scalar implementation; not a fall-through case */

/* calculate the effective lda once before the loop */

VLD1.32 {d1[0]}, [A]!
###############################################################
/* FAT CASE: M < 8, N >= 4 */
###############################################################

/* All of the implementation below is like a SAXPY computation.
 * Main idea: load all of X at once; then, in the inner loop, load Y,
 * do the computation, and store back to Y.
 * BETA0 needs to be handled carefully; a sketch follows.
 */
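/* My reading of the BETA0 handling, as an illustrative C sketch: the first
 * pass over Y is peeled and stores without loading; the remaining passes
 * accumulate:
 *
 *    for (int j = 0; j < N; j++)      // peeled 1st pass: no load of Y
 *       Y[j] = A[j*lda] * X[0];
 *    for (int i = 1; i < M; i++)      // later passes accumulate into Y
 *       for (int j = 0; j < N; j++)
 *          Y[j] += A[i + j*lda] * X[i];
 */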
794
###############################################################
797
/* need to handle Y clean up for 4,2 case */
798
/* Special case M<8 and N>=4, saxpy like implementation*/
802
/* BETA0, then peel 1st iteration,
803
* for 1st iteration, we need to store without load
808
/* JTARGCY==1, it is not direct case, FAT is called for cleanup
826
add A, A, lda, LSL #2
828
BNE BETA0_M1_PEEL_LOOP
/* restore A, A2, A3, A4 to their updated positions */

/* one M iteration is done */

BEQ DONE /* M == 0: goto DONE */

/* N remaining: II = (N/4)*4 */

/* CHKXY holds the Y address up to the remainder */
add CHKXY, Y, II, LSL #2

/* M < 4? goto M2 case */
############################################################
/* M == 4: this implementation is better than an M4 x N4 unroll,
 * about 2 times better */

mov JTARGCY, A /* save A in JTARGCY */

/* load all of X at once */
VLD1.32 {d0-d1}, [X]!

VLD1.32 {d10,d11}, [Y]

VPADD.F32 d16, d12, d13
VPADD.F32 d17, d14, d15

VST1.32 {d16,d17}, [Y]!

add A, A, lda, LSL #4
add A2, A2, lda, LSL #4
add A3, A3, lda, LSL #4
add A4, A4, lda, LSL #4
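/* each pointer advances by lda*16 bytes = 4 columns of A, stepping past
 * the 4 columns just processed */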
/* no cleanup? goto the M-remaining check */

/* cleanup for the M4 case */

/* restore X to its previous position */
VLD1.32 {d0-d1}, [X]!

VPADD.F32 d6, d4, d5 /* d5 is garbage, but it is not used for Y */

add A, A, lda, LSL #2

/* restore Y and reposition A, A2, ... */
sub Y, CHKXY, II, LSL #2

mov A, JTARGCY /* restore A */

add A2, A, lda, LSL #2

/* find the remaining elements for X */

BEQ DONE /* M == 0: goto DONE */

/* II < 2? goto M == 1 case */
/* M < 2? then goto the SCALAR implementation */

#################################################
/* case: for M = 7 (4+2+1), 6 (4+2), 2 and N >= 4 */
/* CHKXY ----> holds the Y address up to the multiple-of-4 part */

/* A3 and A4 are not used here, so
 * A and Y can be stored in A3, A4
 */

/* X, A: multiples of 2 floats */

add A, A, lda, LSL #3
add A2, A2, lda, LSL #3
/* no cleanup for Y? goto the M check */

/* restore X to its previous position */

VPADD.F32 d3, d1, d2 // d2 is garbage

VLD1.32 {d4[0]}, [Y]

VST1.32 {d3[0]}, [Y]!

add A, A, lda, LSL #2

/* remaining-M check */

/* restore A and Y */
/* move A to the appropriate position: A += 8 */

/* check whether M is done; if not, goto the SCALAR loops */
############################################################
/* complete SCALAR implementation for the saxpy-like case */
/* needed for the odd case of M */

VLD1.32 {d0[0]}, [X]!

VLD1.32 {d1[0]}, [A]

VLD1.32 {d3[0]}, [Y]

VST1.32 {d3[0]}, [Y]!

add A, A, lda, LSL #2
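/* A steps one column (lda floats = lda*4 bytes) per Y element */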