1
/* Fast GEMM routine for Alpha 21164(A) */
2
/* on Linux, Digital UNIX and NT */
3
/* by Kazushige Goto <goto@statabo.rim.or.jp> */
17
/* Internal BLOCK Leading size.*/
21
/* Now starting Main program */
24
/* Routine name selection.  Four alternative definitions appear back to
   back here; presumably each one sits under its own transpose-mode
   conditional that is not visible in this view -- TODO confirm. */
#define ROUTINE GEMM_NN
27
#define ROUTINE GEMM_NT
30
#define ROUTINE GEMM_TN
33
#define ROUTINE GEMM_TT
36
/* Byte offset that is added to the work buffer base ($28) via
   Address_H/Address_L to reach the second packing area. */
#define MATRIX_A ((P>>2)*LDA*SIZE)
37
/* NOTE(review): MATRIX_B is not referenced in the visible part of the file. */
#define MATRIX_B ((R>>1)*LDB*SIZE)
39
/* Stack frame size in bytes (16*8 = 128); the prologue subtracts it from
   $30 and the epilogue adds it back. */
#define STACKSIZE 16*8
51
/* Prologue: grow the stack by STACKSIZE bytes.  The register saves that
   follow in the full file are not visible in this view. */
lda $30, -STACKSIZE($30) # prepare stack
53
/* save original register value */
71
/* load the stack-passed arguments; they sit just above the new frame */
72
ldq $9, 0+STACKSIZE($30) # B
73
ldl $19, 8+STACKSIZE($30) # ldb
74
ldq $13,16+STACKSIZE($30) # C
75
ldl $10,24+STACKSIZE($30) # ldc
76
ldq $28,32+STACKSIZE($30) # buffer
86
/* Clamp the panel depth: min_l = min(k - ls, $0). */
subl $18,$3,$7 # min_l = k - ls
87
cmple $7, $0, $1 # $1 = (min_l <= $0); $0 presumably holds Q (set outside this view)
88
cmoveq $1, $0, $7 # if $1 == 0 (i.e. min_l > $0) then min_l = $0
91
/* Pack a panel of B into the buffer area at $28 + MATRIX_A (first layout).
   Only part of the original instruction stream is visible here.  Pairs of
   instructions that differ only in the argument registers they read
   ($19/$17/$9 versus $21/$16/$20) look like the two arms of a conditional
   whose #else/#endif lines are not visible -- TODO confirm. */
#if defined(NN) || (!defined(C_VERSION) && defined(TN)) \
92
|| (defined(C_VERSION) && defined(NT))
96
mull $19,$14,$2 # js*ldb
97
subl $17,$14,$12 # min_j = n - js
100
mull $21,$14,$2 # js*ldb (same computation with the other register assignment)
101
subl $16,$14,$12 # min_j = n - js (other register assignment)
105
ldah $24, Address_H(MATRIX_A)($28) # a_offset = &sb[0][0] (high part of buffer + MATRIX_A)
106
cmple $12, $0, $1 # $1 = (min_j <= $0); $0 presumably holds R (set outside this view)
107
cmoveq $1, $0, $12 # if $1 == 0 (i.e. min_j > $0) then min_j = $0
109
lda $24, Address_L(MATRIX_A)($24) # add the low part of the MATRIX_A offset
110
mull $10,$14,$25 # jsldc = js * ldc
112
mov $12, $27 # j = min_j
116
SXADDQ $2,$9,$0 # b_offset = b + js*ldb
118
SXADDQ $2,$20,$0 # b_offset = b + js*ldb (other register assignment)
123
mov $7, $5 # l = min_l
124
mov $24, $2 # a1_offset = a_offset
126
SXADDQ $3, $0, $4 # b1_offset = b_offset + ls
128
SXADDQ $19, $0, $0 # b_offset += ldb
130
SXADDQ $21, $0, $0 # b_offset += ldb (other register assignment)
133
SXADDQ $3, $0, $22 # c1_offset = b_offset + ls (b_offset has already been advanced by ldb)
135
SXADDQ $19, $0, $0 # b_offset += ldb
137
SXADDQ $21, $0, $0 # b_offset += ldb (other register assignment)
139
lda $24,LDB*SIZE($24) # a_offset += LDB
140
subl $27, 2, $27 # j -= 2
164
lda $2, 16*SIZE($2) # a1_offset += 16
165
subl $5, 8, $5 # l -= 8
166
lda $4, 8*SIZE($4) # b1_offset += 8
167
lda $22, 8*SIZE($22) # c1_offset += 8
169
/* Stores use negative offsets because a1_offset was already advanced by 16. */
ST $f10, -16*SIZE($2)
170
ST $f14, -15*SIZE($2)
171
ST $f11, -14*SIZE($2)
172
ST $f15, -13*SIZE($2)
174
ST $f12, -12*SIZE($2)
175
ST $f21, -11*SIZE($2)
176
ST $f13, -10*SIZE($2)
189
bgt $5, $L19 # if l>0 goto $L19
190
bgt $27, $L15 # if j>0 goto $L15
196
/* Pack a panel of B into the buffer area at $28 + MATRIX_A (second layout:
   the source is indexed as b + ls*ldb + js and walked down by ldb).
   Only part of the original instruction stream is visible here.  Pairs of
   instructions that differ only in the argument registers they read look
   like the two arms of a conditional that is not visible -- TODO confirm. */
mull $19, $3, $2 # ls*ldb
197
subl $17,$14,$12 # min_j = n - js
199
mull $21, $3, $2 # ls*ldb (other register assignment)
200
subl $16,$14,$12 # min_j = n - js (other register assignment)
204
ldah $24, Address_H(MATRIX_A)($28) # a_offset = &sb[0][0] (high part of buffer + MATRIX_A)
205
cmple $12, $0, $1 # $1 = (min_j <= $0); $0 presumably holds R (set outside this view)
206
cmoveq $1, $0, $12 # if $1 == 0 (i.e. min_j > $0) then min_j = $0
208
lda $24, Address_L(MATRIX_A)($24) # add the low part of the MATRIX_A offset
209
mull $10,$14,$25 # jsldc = js * ldc
211
mov $12, $27 # j = min_j
215
SXADDQ $2,$9,$0 # b_offset = b + ls*ldb
217
SXADDQ $2,$20,$0 # b_offset = b + ls*ldb (other register assignment)
222
mov $24, $2 # a1_offset = a_offset
223
mov $7, $5 # l = min_l
225
lda $24, LDB*SIZE($24) # a_offset += LDB
226
SXADDQ $14, $0, $4 # b1_offset = b_offset + js
227
lda $0, 2*SIZE($0) # b_offset += 2
232
LD $f10, 0*SIZE($4) # atemp1 = *(b1_offset+0)
233
LD $f11, 1*SIZE($4) # atemp2 = *(b1_offset+1)
234
SXADDQ $19, $4, $4 # b1_offset += ldb
236
LD $f12, 0*SIZE($4) # atemp3 = *(b1_offset+0)
237
LD $f13, 1*SIZE($4) # atemp4 = *(b1_offset+1)
238
SXADDQ $19, $4, $4 # b1_offset += ldb
240
/* Same four loads again with $21 as the stride (other register assignment). */
LD $f10, 0*SIZE($4) # atemp1 = *(b1_offset+0)
241
LD $f11, 1*SIZE($4) # atemp2 = *(b1_offset+1)
242
SXADDQ $21, $4, $4 # b1_offset += ldb
244
LD $f12, 0*SIZE($4) # atemp3 = *(b1_offset+0)
245
LD $f13, 1*SIZE($4) # atemp4 = *(b1_offset+1)
246
SXADDQ $21, $4, $4 # b1_offset += ldb
250
ST $f10, 0*SIZE($2) # *(a1_offset+0) = atemp1
251
ST $f11, 1*SIZE($2) # *(a1_offset+1) = atemp2
252
ST $f12, 2*SIZE($2) # *(a1_offset+2) = atemp3
253
ST $f13, 3*SIZE($2) # *(a1_offset+3) = atemp4
255
lda $2, 4*SIZE($2) # a1_offset += 4
257
bgt $5, $L102 # if l>0 goto $L102
259
subl $27, 2, $27 # j -= 2
265
/* Pack a panel of A into the buffer area that starts at $28 (first layout).
   Two source pointers ($2 and $22), one lda apart, are read and the
   values are stored into two LDA-spaced groups of eight at $4.
   Only part of the original instruction stream is visible here.  Pairs of
   instructions that differ only in the argument registers they read look
   like the two arms of a conditional whose #else/#endif lines are not
   visible -- TODO confirm. */
#if defined(NN) || (!defined(C_VERSION) && defined(NT)) \
266
|| (defined(C_VERSION) && defined(TN))
270
mull $21,$3,$2 # lslda = lda * ls
271
subl $16,$11,$15 # min_i = m - is
273
mull $19,$3,$2 # lslda = lda * ls (other register assignment)
274
subl $17,$11,$15 # min_i = m - is (other register assignment)
276
cmple $15,P, $1 # $1 = (min_i <= P)
277
cmoveq $1, P, $15 # if $1 == 0 (i.e. min_i > P) then min_i = P
279
mov $28, $0 # b_offset = &sa[0][0]
280
mov $7, $5 # l = min_l
283
SXADDQ $2,$20,$24 # a_offset = a + lslda
285
SXADDQ $2,$9, $24 # a_offset = a + lslda (other register assignment)
290
SXADDQ $11, $24, $2 # a1_offset = a_offset + is
292
SXADDQ $21, $24, $24 # a_offset += lda
294
SXADDQ $19, $24, $24 # a_offset += lda (other register assignment)
297
SXADDQ $11, $24, $22 # second source pointer ($22) = a_offset + is, one lda past a1_offset
299
SXADDQ $21, $24, $24 # a_offset += lda
301
SXADDQ $19, $24, $24 # a_offset += lda (other register assignment)
304
mov $0, $4 # b1_offset = b_offset
305
subl $15, 8, $8 # i = min_i - 8
307
lda $0, 8*SIZE($0) # b_offset += 8
330
ble $8,$L38 # if i<=0 goto $L38
334
ST $f10, (0*LDA+0)*SIZE($4)
335
ST $f11, (0*LDA+1)*SIZE($4)
336
ST $f12, (0*LDA+2)*SIZE($4)
337
ST $f13, (0*LDA+3)*SIZE($4)
344
ST $f23, (0*LDA+4)*SIZE($4)
345
ST $f24, (0*LDA+5)*SIZE($4)
346
ST $f25, (0*LDA+6)*SIZE($4)
347
ST $f26, (0*LDA+7)*SIZE($4)
351
LD $f25, 10*SIZE($22) # reload for the next pass ($22 is bumped by 8 below)
352
LD $f26, 11*SIZE($22)
354
ST $f14, (LDA+0)*SIZE($4)
355
ST $f15, (LDA+1)*SIZE($4)
356
ST $f21, (LDA+2)*SIZE($4)
357
ST $f22, (LDA+3)*SIZE($4)
364
ST $f27, (LDA+4)*SIZE($4)
365
ST $f28, (LDA+5)*SIZE($4)
366
ST $f29, (LDA+6)*SIZE($4)
367
ST $f30, (LDA+7)*SIZE($4)
369
LD $f27, 12*SIZE($22)
370
LD $f28, 13*SIZE($22)
371
LD $f29, 14*SIZE($22)
372
LD $f30, 15*SIZE($22)
374
subl $8, 8, $8 # i -= 8
375
lda $22, 8*SIZE($22) # second source pointer += 8
376
lda $4, 2*LDA*SIZE($4) # b1_offset += 2*LDA
377
lda $2, 8*SIZE($2) # a1_offset += 8
379
bgt $8,$L39 # if i>0 goto $L39
382
/* Same sixteen stores once more after the loop (presumably the drain of
   the values loaded on the last pass -- the label is not visible here). */
ST $f10, (0*LDA+0)*SIZE($4)
383
ST $f11, (0*LDA+1)*SIZE($4)
384
ST $f12, (0*LDA+2)*SIZE($4)
385
ST $f13, (0*LDA+3)*SIZE($4)
387
ST $f23, (0*LDA+4)*SIZE($4)
388
ST $f24, (0*LDA+5)*SIZE($4)
389
ST $f25, (0*LDA+6)*SIZE($4)
390
ST $f26, (0*LDA+7)*SIZE($4)
392
ST $f14, (LDA+0)*SIZE($4)
393
ST $f15, (LDA+1)*SIZE($4)
394
ST $f21, (LDA+2)*SIZE($4)
395
ST $f22, (LDA+3)*SIZE($4)
397
ST $f27, (LDA+4)*SIZE($4)
398
ST $f28, (LDA+5)*SIZE($4)
399
ST $f29, (LDA+6)*SIZE($4)
400
ST $f30, (LDA+7)*SIZE($4)
401
bgt $5, $L35 # if l>0 goto $L35
409
/* Pack a panel of A into the buffer area that starts at $28 (second layout:
   the source is indexed as a + is*lda + ls and walked down by lda, and
   sixteen values are stored contiguously per pass).
   Only part of the original instruction stream is visible here.  Pairs of
   instructions that differ only in the argument registers they read look
   like the two arms of a conditional that is not visible -- TODO confirm. */
mull $11, $21, $2 # is*lda
410
subl $16, $11, $15 # min_i = m - is
412
mull $11, $19, $2 # is*lda (other register assignment)
413
subl $17, $11, $15 # min_i = m - is (other register assignment)
416
cmple $15,P, $1 # $1 = (min_i <= P)
417
cmoveq $1, P, $15 # if $1 == 0 (i.e. min_i > P) then min_i = P
420
SXADDQ $2, $20, $6 # c_offset = a + is*lda
421
s4addq $21, 0, $1 # j = 4*lda (stride added to c_offset at the end of each pass)
423
SXADDQ $2, $9, $6 # c_offset = a + is*lda (other register assignment)
424
s4addq $19, 0, $1 # j = 4*lda (other register assignment)
427
mov $28, $0 # b_offset = &sa[0][0]
428
mov $15, $8 # i = min_i
432
mov $0, $4 # b1_offset = b_offset
433
lda $0, LDA*SIZE($0) # b_offset += LDA
435
SXADDQ $3, $6, $24 # a_offset = c_offset + ls
436
mov $7, $5 # l = min_l
440
mov $24, $2 # a1_offset = a_offset
441
lda $24, 4*SIZE($24) # a_offset += 4
449
SXADDQ $21, $2, $2 # a1_offset += lda
451
SXADDQ $19, $2, $2 # a1_offset += lda (other register assignment)
460
SXADDQ $21, $2, $2 # a1_offset += lda
462
SXADDQ $19, $2, $2 # a1_offset += lda (other register assignment)
471
SXADDQ $21, $2, $2 # a1_offset += lda
473
SXADDQ $19, $2, $2 # a1_offset += lda (other register assignment)
481
ST $f10, 0*SIZE($4) # atemp1
482
ST $f14, 1*SIZE($4) # atemp5
483
ST $f23, 2*SIZE($4) # ctemp1
484
ST $f27, 3*SIZE($4) # ctemp5
486
ST $f11, 4*SIZE($4) # atemp2
487
ST $f15, 5*SIZE($4) # atemp6
488
ST $f24, 6*SIZE($4) # ctemp2
489
ST $f28, 7*SIZE($4) # ctemp6
491
ST $f12, 8*SIZE($4) # atemp3
492
ST $f21, 9*SIZE($4) # atemp7
493
ST $f25, 10*SIZE($4) # ctemp3
494
ST $f29, 11*SIZE($4) # ctemp7
496
ST $f13, 12*SIZE($4) # atemp4
497
ST $f22, 13*SIZE($4) # atemp8
498
ST $f26, 14*SIZE($4) # ctemp4
499
ST $f30, 15*SIZE($4) # ctemp8
501
lda $4, 16*SIZE($4) # b1_offset += 16
503
subl $5, 4, $5 # l -= 4
506
subl $8, 4, $8 # i -= 4
507
SXADDQ $1, $6, $6 # c_offset += j (j = 4*lda)
514
/* Compute-phase setup: set the column-pair counter, the pointers into
   the two packed areas and into C, then preload four B values and eight
   A values before entering the inner loop.
   Only part of the original instruction stream is visible here. */
sra $12, 1, $27 # j = (min_j>>1)
515
ldah $26, Address_H(MATRIX_A)($28) # b_orig = &sb[0][0] (high part of buffer + MATRIX_A)
516
lda $26, Address_L(MATRIX_A)($26) # add the low part of the MATRIX_A offset
517
SXADDQ $25,$13,$6 # c_offset = c + jsldc;
526
mov $26, $0 # b_offset = b_orig
528
lda $26, SIZE*LDB($26) # b_orig += LDB
531
mov $28, $24 # a_offset = &sa[0][0]
533
sra $15, 2, $8 # i = (min_i>>2)
536
SXADDQ $11, $6, $22 # c1_offset = c_offset + is
538
SXADDQ $10, $6, $6 # c_offset += ldc(2 times)
541
SXADDQ $10, $6, $6 # c_offset += ldc
544
ble $8,$L52 # if i<=0 goto $L52
548
mov $24, $2 # a1_offset = a_offset
550
sra $7, 2, $5 # l = (min_l>>2)
553
mov $0, $4 # b1_offset = b_offset
555
SXADDQ $10, $22, $23 # c2_offset = c1_offset + ldc
558
lda $24, LDA*SIZE($24) # a_offset += LDA
561
ble $5, $L55 # if l<=0 goto $L55
563
LD $f1, 0*SIZE($4) # btemp1 = *(b1_offset+0)
564
LD $f10, 1*SIZE($4) # btemp2 = *(b1_offset+1)
565
LD $f24, 2*SIZE($4) # btemp3 = *(b1_offset+2)
566
LD $f23, 3*SIZE($4) # btemp4 = *(b1_offset+3)
568
LD $f22, 0*SIZE($2) # atemp1
569
LD $f20, 1*SIZE($2) # atemp2
570
LD $f18, 2*SIZE($2) # atemp3
571
LD $f16, 3*SIZE($2) # atemp4
573
LD $f17, 4*SIZE($2) # atemp5
574
LD $f0, 5*SIZE($2) # atemp6
575
LD $f19, 6*SIZE($2) # atemp7
576
LD $f2, 7*SIZE($2) # atemp8
578
lda $2, 8*SIZE($2) # a1_offset += 8
580
addq $4, 4*SIZE, $4 # b1_offset += 4
581
ble $5, $L57 # if l<=0 goto $L57
586
This loop is very important and directly affects calculation speed.
587
Each load waits 12 clocks, which is enough to load from the 2nd cache.
588
One iteration takes 33 clocks (multiply and add = 32 clocks, plus jump
589
latency = 1 clock). On a 600MHz 21164A machine that works out to
590
1163.6 MFlops in this loop. This is a theoretical value, but it's
591
pretty fast, isn't it?
598
/* Samples from the unrolled inner loop; most of its instructions are not
   visible in this view.  Loads whose destination is $f31 discard the
   value and are used purely as prefetches. */
ldt $f31, 32*SIZE($4) # prefetch
600
ldt $f31, 36*SIZE($4) # prefetch
604
ldt $f31, 44*SIZE($2) # prefetch
685
LD $f4, 3*SIZE($4) # modified
709
lda $2, 16*SIZE($2) # a1_offset += 16
718
addt $f12, $f11, $f12 # $f12 += $f11 (accumulate a product)
745
LD $f10, 5*SIZE($4) # modified
792
lds $f31, 8*SIZE($22) # prefetch(modify intent)
795
lds $f31, 12*SIZE($23) # prefetch(modify intent)
831
lda $2, 4*SIZE($2) # a1_offset += 4
868
lda $2, 4*SIZE($2) # a1_offset += 4
880
addt $f12, $f11, $f12 # $f12 += $f11
882
addt $f25, $f13, $f25 # $f25 += $f13
900
addt $f12, $f11, $f12 # $f12 += $f11
902
addt $f25, $f13, $f25 # $f25 += $f13
905
addt $f26, $f14, $f26 # $f26 += $f14
907
addt $f27, $f15, $f27 # $f27 += $f15
912
/* Handle the min_l & 3 leftover steps of the depth loop, then combine
   the results with the values loaded from C (through c1_offset and
   c2_offset) and store them back.
   Only part of the original instruction stream is visible here. */
and $7,3,$5 # l = (min_l&3)
913
beq $5,$L60 # if l==0 goto $L60
915
LD $f22, 0*SIZE($2) # atemp1
916
LD $f1, 0*SIZE($4) # btemp1
918
LD $f20, 1*SIZE($2) # atemp2
919
LD $f10, 1*SIZE($4) # btemp2
920
LD $f18, 2*SIZE($2) # atemp3
921
LD $f16, 3*SIZE($2) # atemp4
923
lda $2, 4*SIZE($2) # a1_offset += 4
925
lda $4, 2*SIZE($4) # b1_offset += 2
926
ble $5,$L62 # if l<=0 goto $L62
945
lda $2, 4*SIZE($2) # a1_offset += 4
950
lda $4, 2*SIZE($4) # b1_offset += 2
969
bgt $5,$L63 # if l>0 goto $L63
999
LD $f20, 1*SIZE($22) # atemp2 = *(c1_offset+1)
1001
LD $f18, 2*SIZE($22) # atemp3 = *(c1_offset+2)
1004
LD $f16, 3*SIZE($22) # atemp4 = *(c1_offset+3)
1006
LD $f0, 1*SIZE($23) # atemp6 = *(c2_offset+1)
1009
LD $f19, 2*SIZE($23) # atemp7 = *(c2_offset+2)
1011
LD $f2, 3*SIZE($23) # atemp8 = *(c2_offset+3)
1019
addq $22, 4*SIZE, $22 # c1_offset += 4
1023
addt $f20, $f13, $f25 # ctemp2 = atemp2 + $f13
1024
mult $f3, $f29, $f13 # $f13 = $f3 * $f29 ($f3 is called alpha elsewhere in this file)
1025
addt $f18, $f14, $f26 # ctemp3 = atemp3 + $f14
1026
mult $f3, $f30, $f14 # $f14 = $f3 * $f30
1031
/* c1_offset was already advanced by 4 above, hence the negative offsets. */
ST $f12, -4*SIZE($22) # *(c1_offset+0) = ctemp1
1035
ST $f25, -3*SIZE($22) # *(c1_offset+1) = ctemp2
1040
ST $f26, -2*SIZE($22) # *(c1_offset+2) = ctemp3
1045
ST $f27, -1*SIZE($22) # *(c1_offset+3) = ctemp4
1048
ST $f28, 0*SIZE($23) # *(c2_offset+0) = ctemp5
1053
ST $f29, 1*SIZE($23) # *(c2_offset+1) = ctemp6
1058
ST $f30, 2*SIZE($23) # *(c2_offset+2) = ctemp7
1060
ST $f21, 3*SIZE($23) # *(c2_offset+3) = ctemp8
1061
bgt $8,$L54 # if i>0 goto $L54
1065
/* Leftover rows (min_i & 3) for a pair of columns: one A value is
   multiplied with two B values per step, accumulated in $f12 and $f28,
   scaled by $f3 (alpha) and added into *c1_offset and *c2_offset.
   Only part of the original instruction stream is visible here. */
and $15,3,$8 # i = (min_i&3)
1066
beq $8,$L49 # if i==0 goto $L49
1072
mov $24, $2 # a1_offset = a_offset
1073
addq $24, SIZE, $24 # a_offset ++
1074
mov $0, $4 # b1_offset = b_offset
1075
SXADDQ $10, $22, $23 # c2_offset = c1_offset + ldc
1077
fclr $f12 # ctemp1 = ZERO
1078
mov $7, $5 # l = min_l
1079
fclr $f28 # ctemp5 = ZERO
1080
ble $7,$L71 # if l<=0 goto $L71
1084
LD $f22, 0($2) # atemp1
1085
lda $2, 4*SIZE($2) # a1_offset += 4
1087
LD $f1, 0*SIZE($4) # btemp1 = *(b1_offset+0)
1088
LD $f10, 1*SIZE($4) # btemp2 = *(b1_offset+1)
1090
mult $f1, $f22,$f11 # temp1 = btemp1 * atemp1
1091
mult $f10,$f22,$f13 # temp2 = btemp2 * atemp1
1092
lda $4, 2*SIZE($4) # b1_offset += 2
1095
addt $f12,$f11,$f12 # ctemp1 = ctemp1 + temp1
1096
addt $f28,$f13,$f28 # ctemp5 = ctemp5 + temp2
1097
bgt $5,$L73 # if l>0 goto $L73
1102
mult $f3,$f12,$f11 # temp1 = alpha*ctemp1
1103
mult $f3,$f28,$f13 # temp2 = alpha*ctemp5
1104
LD $f22, 0*SIZE($22) # atemp1 = *c1_offset
1105
LD $f17, 0*SIZE($23) # atemp5 = *c2_offset
1107
addt $f22,$f11,$f12 # ctemp1 = atemp1 + temp1
1108
addt $f17,$f13,$f28 # ctemp5 = atemp5 + temp2
1110
ST $f12, 0*SIZE($22) # *c1_offset = ctemp1
1112
ST $f28, 0*SIZE($23) # *c2_offset = ctemp5
1113
addq $22, SIZE, $22 # c1_offset ++
1114
bgt $8,$L69 # if i>0 goto $L69
1119
subl $27,1,$27 # j --
1120
bgt $27,$L50 # if j>0 goto $L50
1126
/* Odd-column case: entered only when the low bit of min_j is set.
   Sets up the pointers for a single column, preloads eight A values and
   two B values, then runs the unrolled depth loop.
   Only part of the original instruction stream is visible here. */
blbc $12,$L29 # if !(min_j&1) goto $L29
1128
sra $15,2,$8 # i = (min_i>>2)
1131
mov $26, $0 # b_offset = b_orig
1133
mov $28, $24 # a_offset = &sa[0][0]
1136
SXADDQ $11, $6, $22 # c1_offset = c_offset + is
1137
ble $8,$L79 # if i<=0 goto $L79
1143
mov $24, $2 # a1_offset = a_offset
1145
sra $7, 2, $5 # l = (min_l>>2)
1148
mov $0, $4 # b1_offset = b_offset
1150
lda $24, LDA*SIZE($24) # a_offset += LDA
1156
LD $f1, 0*SIZE($4) # btemp1
1157
LD $f10, 2*SIZE($4) # btemp2
1159
LD $f22, 0*SIZE($2) # atemp1
1160
LD $f20, 1*SIZE($2) # atemp2
1161
LD $f18, 2*SIZE($2) # atemp3
1162
LD $f16, 3*SIZE($2) # atemp4
1163
LD $f17, 4*SIZE($2) # atemp5
1164
LD $f0, 5*SIZE($2) # atemp6
1165
LD $f19, 6*SIZE($2) # atemp7
1166
LD $f2, 7*SIZE($2) # atemp8
1168
lda $2, 8*SIZE($2) # a1_offset += 8
1170
subl $5, 1, $5 # l--
1171
lda $4, 4*SIZE($4) # b1_offset += 4
1172
ble $5, $L84 # if l<= 0 goto $L84
1178
LD $f22, 0*SIZE($2) # atemp1
1183
LD $f20, 1*SIZE($2) # atemp2
1188
LD $f18, 2*SIZE($2) # atemp3
1193
LD $f1, 0*SIZE($4) # btemp1
1196
LD $f16, 3*SIZE($2) # atemp4
1198
LD $f17, 4*SIZE($2) # atemp5
1203
LD $f0, 5*SIZE($2) # atemp6
1208
LD $f19, 6*SIZE($2) # atemp7
1213
LD $f10, 2*SIZE($4) # btemp2
1216
LD $f2, 7*SIZE($2) # atemp8
1218
LD $f22, 8*SIZE($2) # atemp1
1223
LD $f20, 9*SIZE($2) # atemp2
1228
LD $f18, 10*SIZE($2) # atemp3
1231
lda $2, 16*SIZE($2) # a1_offset += 16
1233
LD $f1, 4*SIZE($4) # btemp1
1236
/* a1_offset was already advanced by 16 above, hence the negative offsets. */
LD $f16, -5*SIZE($2) # atemp4
1238
LD $f17, -4*SIZE($2) # atemp5
1243
LD $f0, -3*SIZE($2) # atemp6
1248
LD $f19, -2*SIZE($2) # atemp7
1253
LD $f10, 6*SIZE($4) # btemp2
1255
LD $f2, -1*SIZE($2) # atemp8
1256
subl $5, 1, $5 # l--
1257
lda $4, 8*SIZE($4) # b1_offset += 8
1258
bgt $5,$L85 # if l>0 goto $L85
1264
/* Odd-column case, continued: the final pass of the unrolled depth
   loop, the min_l & 3 leftover steps, and the read/update/store of four
   C values through c1_offset.
   Only part of the original instruction stream is visible here. */
LD $f22, 0*SIZE($2) # atemp1
1269
LD $f20, 1*SIZE($2) # atemp2
1274
LD $f18, 2*SIZE($2) # atemp3
1279
LD $f1, 0*SIZE($4) # btemp1
1282
LD $f16, 3*SIZE($2) # atemp4
1284
LD $f17, 4*SIZE($2) # atemp5
1289
LD $f0, 5*SIZE($2) # atemp6
1294
LD $f19, 6*SIZE($2) # atemp7
1297
lda $2, 8*SIZE($2) # a1_offset += 8
1298
mult $f10, $f2, $f15 # $f15 = btemp2 * atemp8 (uses the old values before the reloads below)
1299
LD $f10, 2*SIZE($4) # btemp2 = *(b1_offset+2)
1302
LD $f2, -1*SIZE($2) # atemp8 (a1_offset already advanced by 8)
1304
lda $4, 4*SIZE($4) # b1_offset += 4
1328
and $7, 3, $5 # l = (min_l&3)
1329
beq $5, $L87 # if l==0 goto $L87
1331
LD $f22, 0*SIZE($2) # atemp1
1332
LD $f1, 0*SIZE($4) # btemp1 = *(b1_offset+0)
1334
LD $f20, 1*SIZE($2) # atemp2
1335
LD $f18, 2*SIZE($2) # atemp3
1336
LD $f16, 3*SIZE($2) # atemp4
1337
lda $2, 4*SIZE($2) # a1_offset += 4
1339
subl $5, 1, $5 # l--
1340
lda $4, 2*SIZE($4) # b1_offset += 2
1342
ble $5,$L89 # if l<=0 goto $L89
1350
LD $f22, 0*SIZE($2) # atemp1
1355
LD $f20, 1*SIZE($2) # atemp2
1360
LD $f18, 2*SIZE($2) # atemp3
1363
lda $2, 4*SIZE($2) # a1_offset += 4
1365
LD $f1, 0*SIZE($4) # btemp1
1367
LD $f16, -1*SIZE($2) # atemp4 (a1_offset already advanced by 4)
1368
subl $5, 1, $5 # l--
1369
lda $4, 2*SIZE($4) # b1_offset += 2
1370
bgt $5, $L90 # if l>0 goto $L90
1387
LD $f22, 0*SIZE($22) # atemp1 = *(c1_offset+0)
1389
LD $f20, 1*SIZE($22) # atemp2 = *(c1_offset+1)
1392
LD $f18, 2*SIZE($22) # atemp3 = *(c1_offset+2)
1394
LD $f16, 3*SIZE($22) # atemp4 = *(c1_offset+3)
1406
ST $f12, 0*SIZE($22) # *(c1_offset+0) = $f12
1408
ST $f25, 1*SIZE($22) # *(c1_offset+1) = $f25
1410
ST $f26, 2*SIZE($22) # *(c1_offset+2) = $f26
1412
ST $f27, 3*SIZE($22) # *(c1_offset+3) = $f27
1415
subl $8, 1, $8 # i--
1416
addq $22, 4*SIZE, $22 # c1_offset += 4
1422
/* Odd-column case, leftover rows (min_i & 3): one A value times one B
   value per step, accumulated in $f12, scaled by $f3 (alpha) and added
   into *c1_offset.
   Only part of the original instruction stream is visible here. */
and $15,3,$8 # i = (min_i&3)
1423
beq $8,$L29 # if i==0 goto $L29
1429
mov $24, $2 # a1_offset = a_offset
1431
addq $24, SIZE, $24 # a_offset ++
1434
mov $0, $4 # b1_offset = b_offset
1435
mov $7, $5 # l = min_l
1436
ble $7,$L98 # if l<=0 goto $L98
1441
LD $f22, 0*SIZE($2) # atemp1
1442
LD $f1, 0*SIZE($4) # btemp1
1444
lda $2, 4*SIZE($2) # a1_offset += 4
1445
lda $4, 2*SIZE($4) # b1_offset += 2
1450
bgt $5,$L100 # if l>0 goto $L100
1454
addt $f12,$f11,$f12 # ctemp1 += temp1
1455
mult $f3,$f12,$f11 # temp1 = alpha*ctemp1
1456
LD $f22, 0*SIZE($22) # atemp1 = *c1_offset
1457
addt $f22,$f11,$f12 # ctemp1 = atemp1 + temp1
1460
ST $f12, 0*SIZE($22) # *c1_offset = ctemp1
1461
addq $22, SIZE, $22 # c1_offset ++
1462
bgt $8,$L96 # if i>0 goto $L96
1466
/* Advance the three blocking indices (is by P, js by R, ls by Q) and
   compare each against its bound; the branches that consume $1 are not
   visible in this view.  The two compares after each increment differ
   only in the bound register ($16 versus $17) and look like the two arms
   of a conditional that is not visible -- TODO confirm. */
lda $11, P($11) # is += P
1469
cmplt $11,$16,$1 # is < m ?
1471
cmplt $11,$17,$1 # is < m ? (other register assignment)
1477
lda $14, R($14) # js += R
1480
cmplt $14,$17,$1 # js < n ?
1482
cmplt $14,$16,$1 # js < n ? (other register assignment)
1488
lda $3, Q($3) # ls += Q
1490
cmplt $3,$18,$1 # ls < k ??
1511
lda $30, STACKSIZE($30) # release the stack frame reserved in the prologue