3
// y(i) = y(i) + a * x(i)
6
// modification of daxpy, using pipelined loads of x values
7
// reads two elements beyond end of array, assumes strides of 1
8
// based on code by Dave Scott, but adapted for use with
9
// matrix diagonaliser.
17
// r30 -1 constant for bla
38
ld.l r0(r16),r16 // get n
39
adds -1,r0,r30 // -1 for bla
40
adds -8,r19,r19 // correct y pointer
41
shr 3,r16,r20 // loop count = n/8
42
bte r20,r0,finale // bypass main loop for N < 8
43
nop // align for dual mode
44
fst.q f0,-64(sp)++ // save regs
48
d.pfiadd.dd f0,f0,f0 // initiate dual mode
49
pfld.d r0(r17),f0 // discard, load A
51
adds -1,r20,r20 // adjust loop count for bla
53
pfld.d r0(r18),f0 // discard, read x1
55
pfld.d 8(r18)++,f0 // discard, read x2
57
bla r30,r20,loop // prime LCC
59
pfld.d 8(r18)++,f2 // load A, read x3
61
fld.d 8(r19),f16 // read y1
63
pfld.d 8(r18)++,f4 // load x1, read x4
65
fld.d 16(r19),f18 // read y2
67
pfld.d 8(r18)++,f6 // load x2, read x5
68
d.pfmul.dd f2,f4,f0 // - start a*x1
69
fld.d 24(r19),f20 // read y3
71
pfld.d 8(r18)++,f8 // load x3, read x6
72
d.pfmul.dd f2,f6,f0 // - start a*x2
73
fld.d 32(r19),f22 // read y4
75
pfld.d 8(r18)++,f10 // load x4, read x7
76
d.pfmul.dd f2,f8,f12 // store a*x1, start a*x3
77
fld.d 40(r19),f24 // read y5
78
d.pfadd.dd f16,f12,f0 // - start (a*x1)+y1
79
pfld.d 8(r18)++,f4 // load x5, read x8
80
d.pfmul.dd f2,f10,f12 // store a*x2, start a*x4
81
fld.d 48(r19),f26 // read y6
82
d.pfadd.dd f18,f12,f0 // - start (a*x2)+y2
83
pfld.d 8(r18)++,f6 // load x6, read x1'
84
d.pfmul.dd f2,f4,f12 // store a*x3, start a*x5
85
fld.d 56(r19),f28 // read y7
86
d.pfadd.dd f20,f12,f0 // - start (a*x3)+y3
87
pfld.d 8(r18)++,f8 // load x7, read x2'
88
d.pfmul.dd f2,f6,f12 // store a*x4, start a*x6
89
fld.d 64(r19),f30 // read y8
90
d.pfadd.dd f22,f12,f16 // store y1 start (a*x4)+y4
91
pfld.d 8(r18)++,f10 // load x8, read x3'
92
d.pfmul.dd f2,f8,f12 // store a*x5, start a*x7
93
fst.d f16,8(r19)++ // save y1
94
d.pfadd.dd f24,f12,f18 // store y2 start (a*x5)+y5
96
d.pfmul.dd f2,f10,f12 // store a*x6 start a*x8
98
d.pfadd.dd f26,f12,f20 // store y3 start (a*x6)+y6
100
d.pfmul.dd f0,f0,f12 // store a*x7 flush
101
fst.d f18,8(r19)++ // save y2
102
d.pfadd.dd f28,f12,f22 // store y4, start (a*x7)+y7
103
fst.d f20,8(r19)++ // save y3
104
d.pfmul.dd f0,f0,f12 // store a*x8 flush
105
fst.d f22,8(r19)++ // save y4
106
d.pfadd.dd f30,f12,f24 // store y5, start (a*x8)+y8
107
fst.d f24,8(r19)++ // save y5
108
d.pfadd.dd f0,f0,f26 // store y6, flush
109
fst.d f26,8(r19)++ // save y6
110
d.pfadd.dd f0,f0,f28 // store y7, flush
111
fst.d f28,8(r19)++ // save y7
112
d.pfadd.dd f0,f0,f30 // store y8, flush
113
bla r30,r20,loop // loop control
115
fst.d f30,8(r19)++ // save y8
117
// exit dual mode, restore fp regs
126
adds -16,r18,r18 // correct overrun for remainder
128
and 7,r16,r16 // mask off to get remainder
129
bc exit // skip if nothing left
130
adds r30,r16,r16 // pre-adjust counter
131
adds -8,r18,r18 // adjust x pointer
133
fld.d r0(r17),f16 // re-load a
135
fld.d 8(r18)++,f18 // load x value
136
fld.d 8(r19),f20 // load y value
137
fmul.dd f16,f18,f18 // a * x(i)
138
fadd.dd f18,f20,f20 // + y(i)
140
fst.d f20,8(r19)++ // ...storing updated value