#include "atlas_misc.h"
#include "atlas_prefetch.h"
#define RTYPE register TYPE
#ifndef NRHS
   #define NRHS 4
#endif
#if defined(__GNUC__) || \
    (defined(__STDC_VERSION__) && (__STDC_VERSION__/100 >= 1999))
   #define ATL_SINLINE static inline
#else
   #define ATL_SINLINE static
#endif

#if defined(ATL_AVX) && defined(DREAL)

#include <immintrin.h>
/*
 * Subtract off x0...x3 contribution to all remaining equations using a
 * rank-4 update with mu=4, nu=3, ku=4.  This version is for 16 AVX regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
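/*
 * Added for reference (not in the original source): a plain-C sketch of
 * the operation every ATL_rk4 variant in this file performs, i.e. the
 * rank-4 update C -= A*B on column-major operands, where A is Mx4, B is
 * 4xnu (nu = # of RHS kept in registers), and C is Mxnu.  Disabled so it
 * cannot affect the build.
 */
#if 0
ATL_SINLINE void ATL_rk4_ref(ATL_CINT M, const TYPE *A, ATL_CINT lda,
                             TYPE *pB0, ATL_CINT ldb, TYPE *C, ATL_CINT ldc)
{
   ATL_INT i, j, k;
   for (j=0; j < 3; j++)         /* nu=3 right-hand sides in this version */
      for (k=0; k < 4; k++)      /* ku=4 just-solved unknowns */
         for (i=0; i < M; i++)   /* M remaining equations */
            C[j*ldc+i] -= A[k*lda+i] * pB0[j*ldb+k];
}
#endif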
ATL_SINLINE void ATL_rk4(ATL_CINT M, const TYPE *A, ATL_CINT lda,
                         TYPE *pB0, ATL_CINT ldb, TYPE *C, ATL_CINT ldc)
{
   const TYPE *pA0 = A, *pA1 = A+lda,
              *pA2 = A+((lda)<<1), *pA3=pA1+((lda)<<1);
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1);
   ATL_CINT MM = (M & 4) ? M-4 : M-8;
   ATL_INT i;
   register __m256d rB;
   register __m256d rB00, rB01, rB02;
   register __m256d rB20, rB21, rB22;
   register __m256d rC00, rC01, rC02;
   register __m256d rC40, rC41, rC42;
   register __m256d rA0, rA1;
   rB00 = _mm256_broadcast_pd((void*)pB0);          /* B10 B00 B10 B00 */
   rB20 = _mm256_broadcast_pd((void*)(pB0+2));
   rB01 = _mm256_broadcast_pd((void*)(pB0+ldb));
   rB21 = _mm256_broadcast_pd((void*)(pB0+ldb+2));
   rB02 = _mm256_broadcast_pd((void*)(pB0+ldb+ldb));
   rB22 = _mm256_broadcast_pd((void*)(pB0+ldb+ldb+2));

   rC00 = _mm256_load_pd(pC0);                      /* C30 C20 C10 C00 */
   rC01 = _mm256_load_pd(pC1);
   rC02 = _mm256_load_pd(pC2);
   rA0 = _mm256_load_pd(pA0);                       /* A30 A20 A10, A00 */
   for (i=0; i < MM; i += 8, pA0 += 8, pA1 += 8, pA2 += 8, pA3 += 8,
        pC0 += 8, pC1 += 8, pC2 += 8)
   {
      rB = _mm256_unpacklo_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA0);
      rC00 = _mm256_sub_pd(rC00, rB); rA1 = _mm256_load_pd(pA1);
      rB = _mm256_unpacklo_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA0);
      rC01 = _mm256_sub_pd(rC01, rB); rC40 = _mm256_load_pd(pC0+4);
      rB = _mm256_unpacklo_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA0);
      rC02 = _mm256_sub_pd(rC02, rB); rA0 = _mm256_load_pd(pA2);

      rB = _mm256_unpackhi_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA1);
      rC00 = _mm256_sub_pd(rC00, rB); rC41 = _mm256_load_pd(pC1+4);
      rB = _mm256_unpackhi_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA1);
      rC01 = _mm256_sub_pd(rC01, rB); rC42 = _mm256_load_pd(pC2+4);
      rB = _mm256_unpackhi_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA1);
      rC02 = _mm256_sub_pd(rC02, rB); rA1 = _mm256_load_pd(pA3);

      rB = _mm256_unpacklo_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA0);
      rC00 = _mm256_sub_pd(rC00, rB);
      rB = _mm256_unpacklo_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA0);
      rC01 = _mm256_sub_pd(rC01, rB);
      rB = _mm256_unpacklo_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA0);
      rC02 = _mm256_sub_pd(rC02, rB); rA0 = _mm256_load_pd(pA0+4);

      rB = _mm256_unpackhi_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA1);
      rC00 = _mm256_sub_pd(rC00, rB); _mm256_store_pd(pC0, rC00);
      rB = _mm256_unpackhi_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA1);
      rC01 = _mm256_sub_pd(rC01, rB); _mm256_store_pd(pC1, rC01);
      rB = _mm256_unpackhi_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA1);
      rC02 = _mm256_sub_pd(rC02, rB); rA1 = _mm256_load_pd(pA1+4);

      rB = _mm256_unpacklo_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA0);
      rC40 = _mm256_sub_pd(rC40, rB); _mm256_store_pd(pC2, rC02);
      rB = _mm256_unpacklo_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA0);
      rC41 = _mm256_sub_pd(rC41, rB); rC00 = _mm256_load_pd(pC0+8);
      rB = _mm256_unpacklo_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA0);
      rC42 = _mm256_sub_pd(rC42, rB); rA0 = _mm256_load_pd(pA2+4);

      rB = _mm256_unpackhi_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA1);
      rC40 = _mm256_sub_pd(rC40, rB); rC01 = _mm256_load_pd(pC1+8);
      rB = _mm256_unpackhi_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA1);
      rC41 = _mm256_sub_pd(rC41, rB); rC02 = _mm256_load_pd(pC2+8);
      rB = _mm256_unpackhi_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA1);
      rC42 = _mm256_sub_pd(rC42, rB); rA1 = _mm256_load_pd(pA3+4);

      rB = _mm256_unpacklo_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA0);
      rC40 = _mm256_sub_pd(rC40, rB);
      rB = _mm256_unpacklo_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA0);
      rC41 = _mm256_sub_pd(rC41, rB);
      rB = _mm256_unpacklo_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA0);
      rC42 = _mm256_sub_pd(rC42, rB); rA0 = _mm256_load_pd(pA0+8);

      rB = _mm256_unpackhi_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA1);
      rC40 = _mm256_sub_pd(rC40, rB); _mm256_store_pd(pC0+4, rC40);
      rB = _mm256_unpackhi_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA1);
      rC41 = _mm256_sub_pd(rC41, rB); _mm256_store_pd(pC1+4, rC41);
      rB = _mm256_unpackhi_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA1);
      rC42 = _mm256_sub_pd(rC42, rB); _mm256_store_pd(pC2+4, rC42);
   }
/*
 * Drain C load/use pipeline
 */
   if (M-MM == 4) /* drain pipe over 1 iteration */
   {
      rB = _mm256_unpacklo_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA0);
      rC00 = _mm256_sub_pd(rC00, rB); rA1 = _mm256_load_pd(pA1);
      rB = _mm256_unpacklo_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA0);
      rC01 = _mm256_sub_pd(rC01, rB);
      rB = _mm256_unpacklo_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA0);
      rC02 = _mm256_sub_pd(rC02, rB); rA0 = _mm256_load_pd(pA2);

      rB = _mm256_unpackhi_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA1);
      rC00 = _mm256_sub_pd(rC00, rB);
      rB = _mm256_unpackhi_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA1);
      rC01 = _mm256_sub_pd(rC01, rB);
      rB = _mm256_unpackhi_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA1);
      rC02 = _mm256_sub_pd(rC02, rB); rA1 = _mm256_load_pd(pA3);

      rB = _mm256_unpacklo_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA0);
      rC00 = _mm256_sub_pd(rC00, rB);
      rB = _mm256_unpacklo_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA0);
      rC01 = _mm256_sub_pd(rC01, rB);
      rB = _mm256_unpacklo_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA0);
      rC02 = _mm256_sub_pd(rC02, rB);

      rB = _mm256_unpackhi_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA1);
      rC00 = _mm256_sub_pd(rC00, rB); _mm256_store_pd(pC0, rC00);
      rB = _mm256_unpackhi_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA1);
      rC01 = _mm256_sub_pd(rC01, rB); _mm256_store_pd(pC1, rC01);
      rB = _mm256_unpackhi_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA1);
      rC02 = _mm256_sub_pd(rC02, rB); _mm256_store_pd(pC2, rC02);
   }
   else /* M-MM = 8, drain pipe over 2 iterations */
   {
      rB = _mm256_unpacklo_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA0);
      rC00 = _mm256_sub_pd(rC00, rB); rA1 = _mm256_load_pd(pA1);
      rB = _mm256_unpacklo_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA0);
      rC01 = _mm256_sub_pd(rC01, rB); rC40 = _mm256_load_pd(pC0+4);
      rB = _mm256_unpacklo_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA0);
      rC02 = _mm256_sub_pd(rC02, rB); rA0 = _mm256_load_pd(pA2);

      rB = _mm256_unpackhi_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA1);
      rC00 = _mm256_sub_pd(rC00, rB); rC41 = _mm256_load_pd(pC1+4);
      rB = _mm256_unpackhi_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA1);
      rC01 = _mm256_sub_pd(rC01, rB); rC42 = _mm256_load_pd(pC2+4);
      rB = _mm256_unpackhi_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA1);
      rC02 = _mm256_sub_pd(rC02, rB); rA1 = _mm256_load_pd(pA3);

      rB = _mm256_unpacklo_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA0);
      rC00 = _mm256_sub_pd(rC00, rB);
      rB = _mm256_unpacklo_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA0);
      rC01 = _mm256_sub_pd(rC01, rB);
      rB = _mm256_unpacklo_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA0);
      rC02 = _mm256_sub_pd(rC02, rB); rA0 = _mm256_load_pd(pA0+4);

      rB = _mm256_unpackhi_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA1);
      rC00 = _mm256_sub_pd(rC00, rB); _mm256_store_pd(pC0, rC00);
      rB = _mm256_unpackhi_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA1);
      rC01 = _mm256_sub_pd(rC01, rB); _mm256_store_pd(pC1, rC01);
      rB = _mm256_unpackhi_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA1);
      rC02 = _mm256_sub_pd(rC02, rB); rA1 = _mm256_load_pd(pA1+4);

      rB = _mm256_unpacklo_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA0);
      rC40 = _mm256_sub_pd(rC40, rB); _mm256_store_pd(pC2, rC02);
      rB = _mm256_unpacklo_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA0);
      rC41 = _mm256_sub_pd(rC41, rB);
      rB = _mm256_unpacklo_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA0);
      rC42 = _mm256_sub_pd(rC42, rB); rA0 = _mm256_load_pd(pA2+4);

      rB = _mm256_unpackhi_pd(rB00, rB00);
      rB = _mm256_mul_pd(rB, rA1);
      rC40 = _mm256_sub_pd(rC40, rB);
      rB = _mm256_unpackhi_pd(rB01, rB01);
      rB = _mm256_mul_pd(rB, rA1);
      rC41 = _mm256_sub_pd(rC41, rB);
      rB = _mm256_unpackhi_pd(rB02, rB02);
      rB = _mm256_mul_pd(rB, rA1);
      rC42 = _mm256_sub_pd(rC42, rB); rA1 = _mm256_load_pd(pA3+4);

      rB = _mm256_unpacklo_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA0);
      rC40 = _mm256_sub_pd(rC40, rB);
      rB = _mm256_unpacklo_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA0);
      rC41 = _mm256_sub_pd(rC41, rB);
      rB = _mm256_unpacklo_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA0);
      rC42 = _mm256_sub_pd(rC42, rB);

      rB = _mm256_unpackhi_pd(rB20, rB20);
      rB = _mm256_mul_pd(rB, rA1);
      rC40 = _mm256_sub_pd(rC40, rB); _mm256_store_pd(pC0+4, rC40);
      rB = _mm256_unpackhi_pd(rB21, rB21);
      rB = _mm256_mul_pd(rB, rA1);
      rC41 = _mm256_sub_pd(rC41, rB); _mm256_store_pd(pC1+4, rC41);
      rB = _mm256_unpackhi_pd(rB22, rB22);
      rB = _mm256_mul_pd(rB, rA1);
      rC42 = _mm256_sub_pd(rC42, rB); _mm256_store_pd(pC2+4, rC42);
   }
}

#elif defined(ATL_SSE2) && defined(DREAL)

#include <emmintrin.h>
/*
 * Subtract off x0...x3 contribution to all remaining equations using a
 * rank-4 update with mu=4, nu=3, ku=4.  This version is for 16 SSE2 regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
ATL_SINLINE void ATL_rk4(ATL_CINT M, const TYPE *A, ATL_CINT lda,
                         TYPE *pB0, ATL_CINT ldb, TYPE *C, ATL_CINT ldc)
{
   const TYPE *pA0 = A, *pA1 = A+lda,
              *pA2 = A+((lda)<<1), *pA3=pA1+((lda)<<1);
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1);
   ATL_CINT MM = M-4;
   ATL_INT i;
   register __m128d rB;
   register __m128d rB00, rB01, rB02;
   register __m128d rB20, rB21, rB22;
   register __m128d rC00, rC01, rC02;
   register __m128d rC20, rC21, rC22;
   register __m128d rA0, rA1;
   rB00 = _mm_load_pd(pB0);
   rB20 = _mm_load_pd(pB0+2);
   rB01 = _mm_load_pd(pB0+ldb);
   rB21 = _mm_load_pd(pB0+ldb+2);
   rB02 = _mm_load_pd(pB0+2*ldb);
   rB22 = _mm_load_pd(pB0+2*ldb+2);

   rC00 = _mm_load_pd(pC0);
   rC01 = _mm_load_pd(pC1);
   rC02 = _mm_load_pd(pC2);
   rA0 = _mm_load_pd(pA0);   /* A1, A0 */
   for (i=0; i < MM; i += 4, pA0 += 4, pA1 += 4, pA2 += 4, pA3 += 4,
        pC0 += 4, pC1 += 4, pC2 += 4)
   {
      rB = _mm_unpacklo_pd(rB00, rB00);
      rB = _mm_mul_pd(rB, rA0);
      rC00 = _mm_sub_pd(rC00, rB); rA1 = _mm_load_pd(pA1);
      rB = _mm_unpacklo_pd(rB01, rB01);
      rB = _mm_mul_pd(rB, rA0);
      rC01 = _mm_sub_pd(rC01, rB); rC20 = _mm_load_pd(pC0+2);
      rB = _mm_unpacklo_pd(rB02, rB02);
      rB = _mm_mul_pd(rB, rA0);
      rC02 = _mm_sub_pd(rC02, rB); rA0 = _mm_load_pd(pA2);

      rB = _mm_unpackhi_pd(rB00, rB00);
      rB = _mm_mul_pd(rB, rA1);
      rC00 = _mm_sub_pd(rC00, rB); rC21 = _mm_load_pd(pC1+2);
      rB = _mm_unpackhi_pd(rB01, rB01);
      rB = _mm_mul_pd(rB, rA1);
      rC01 = _mm_sub_pd(rC01, rB); rC22 = _mm_load_pd(pC2+2);
      rB = _mm_unpackhi_pd(rB02, rB02);
      rB = _mm_mul_pd(rB, rA1);
      rC02 = _mm_sub_pd(rC02, rB); rA1 = _mm_load_pd(pA3);

      rB = _mm_unpacklo_pd(rB20, rB20);
      rB = _mm_mul_pd(rB, rA0);
      rC00 = _mm_sub_pd(rC00, rB);
      rB = _mm_unpacklo_pd(rB21, rB21);
      rB = _mm_mul_pd(rB, rA0);
      rC01 = _mm_sub_pd(rC01, rB);
      rB = _mm_unpacklo_pd(rB22, rB22);
      rB = _mm_mul_pd(rB, rA0);
      rC02 = _mm_sub_pd(rC02, rB); rA0 = _mm_load_pd(pA0+2);

      rB = _mm_unpackhi_pd(rB20, rB20);
      rB = _mm_mul_pd(rB, rA1);
      rC00 = _mm_sub_pd(rC00, rB); _mm_store_pd(pC0, rC00);
      rB = _mm_unpackhi_pd(rB21, rB21);
      rB = _mm_mul_pd(rB, rA1);
      rC01 = _mm_sub_pd(rC01, rB); _mm_store_pd(pC1, rC01);
      rB = _mm_unpackhi_pd(rB22, rB22);
      rB = _mm_mul_pd(rB, rA1);
      rC02 = _mm_sub_pd(rC02, rB); rA1 = _mm_load_pd(pA1+2);

      rB = _mm_unpacklo_pd(rB00, rB00);
      rB = _mm_mul_pd(rB, rA0);
      rC20 = _mm_sub_pd(rC20, rB); _mm_store_pd(pC2, rC02);
      rB = _mm_unpacklo_pd(rB01, rB01);
      rB = _mm_mul_pd(rB, rA0);
      rC21 = _mm_sub_pd(rC21, rB); rC00 = _mm_load_pd(pC0+4);
      rB = _mm_unpacklo_pd(rB02, rB02);
      rB = _mm_mul_pd(rB, rA0);
      rC22 = _mm_sub_pd(rC22, rB); rA0 = _mm_load_pd(pA2+2);

      rB = _mm_unpackhi_pd(rB00, rB00);
      rB = _mm_mul_pd(rB, rA1);
      rC20 = _mm_sub_pd(rC20, rB); rC01 = _mm_load_pd(pC1+4);
      rB = _mm_unpackhi_pd(rB01, rB01);
      rB = _mm_mul_pd(rB, rA1);
      rC21 = _mm_sub_pd(rC21, rB); rC02 = _mm_load_pd(pC2+4);
      rB = _mm_unpackhi_pd(rB02, rB02);
      rB = _mm_mul_pd(rB, rA1);
      rC22 = _mm_sub_pd(rC22, rB); rA1 = _mm_load_pd(pA3+2);

      rB = _mm_unpacklo_pd(rB20, rB20);
      rB = _mm_mul_pd(rB, rA0);
      rC20 = _mm_sub_pd(rC20, rB);
      rB = _mm_unpacklo_pd(rB21, rB21);
      rB = _mm_mul_pd(rB, rA0);
      rC21 = _mm_sub_pd(rC21, rB);
      rB = _mm_unpacklo_pd(rB22, rB22);
      rB = _mm_mul_pd(rB, rA0);
      rC22 = _mm_sub_pd(rC22, rB); rA0 = _mm_load_pd(pA0+4);

      rB = _mm_unpackhi_pd(rB20, rB20);
      rB = _mm_mul_pd(rB, rA1);
      rC20 = _mm_sub_pd(rC20, rB); _mm_store_pd(pC0+2, rC20);
      rB = _mm_unpackhi_pd(rB21, rB21);
      rB = _mm_mul_pd(rB, rA1);
      rC21 = _mm_sub_pd(rC21, rB); _mm_store_pd(pC1+2, rC21);
      rB = _mm_unpackhi_pd(rB22, rB22);
      rB = _mm_mul_pd(rB, rA1);
      rC22 = _mm_sub_pd(rC22, rB); _mm_store_pd(pC2+2, rC22);
   }
/*
 * Drain C load/use pipeline
 */
   rB = _mm_unpacklo_pd(rB00, rB00);
   rB = _mm_mul_pd(rB, rA0);
   rC00 = _mm_sub_pd(rC00, rB); rA1 = _mm_load_pd(pA1);
   rB = _mm_unpacklo_pd(rB01, rB01);
   rB = _mm_mul_pd(rB, rA0);
   rC01 = _mm_sub_pd(rC01, rB); rC20 = _mm_load_pd(pC0+2);
   rB = _mm_unpacklo_pd(rB02, rB02);
   rB = _mm_mul_pd(rB, rA0);
   rC02 = _mm_sub_pd(rC02, rB); rA0 = _mm_load_pd(pA2);

   rB = _mm_unpackhi_pd(rB00, rB00);
   rB = _mm_mul_pd(rB, rA1);
   rC00 = _mm_sub_pd(rC00, rB); rC21 = _mm_load_pd(pC1+2);
   rB = _mm_unpackhi_pd(rB01, rB01);
   rB = _mm_mul_pd(rB, rA1);
   rC01 = _mm_sub_pd(rC01, rB); rC22 = _mm_load_pd(pC2+2);
   rB = _mm_unpackhi_pd(rB02, rB02);
   rB = _mm_mul_pd(rB, rA1);
   rC02 = _mm_sub_pd(rC02, rB); rA1 = _mm_load_pd(pA3);

   rB = _mm_unpacklo_pd(rB20, rB20);
   rB = _mm_mul_pd(rB, rA0);
   rC00 = _mm_sub_pd(rC00, rB);
   rB = _mm_unpacklo_pd(rB21, rB21);
   rB = _mm_mul_pd(rB, rA0);
   rC01 = _mm_sub_pd(rC01, rB);
   rB = _mm_unpacklo_pd(rB22, rB22);
   rB = _mm_mul_pd(rB, rA0);
   rC02 = _mm_sub_pd(rC02, rB); rA0 = _mm_load_pd(pA0+2);

   rB = _mm_unpackhi_pd(rB20, rB20);
   rB = _mm_mul_pd(rB, rA1);
   rC00 = _mm_sub_pd(rC00, rB); _mm_store_pd(pC0, rC00);
   rB = _mm_unpackhi_pd(rB21, rB21);
   rB = _mm_mul_pd(rB, rA1);
   rC01 = _mm_sub_pd(rC01, rB); _mm_store_pd(pC1, rC01);
   rB = _mm_unpackhi_pd(rB22, rB22);
   rB = _mm_mul_pd(rB, rA1);
   rC02 = _mm_sub_pd(rC02, rB); rA1 = _mm_load_pd(pA1+2);

   rB = _mm_unpacklo_pd(rB00, rB00);
   rB = _mm_mul_pd(rB, rA0);
   rC20 = _mm_sub_pd(rC20, rB); _mm_store_pd(pC2, rC02);
   rB = _mm_unpacklo_pd(rB01, rB01);
   rB = _mm_mul_pd(rB, rA0);
   rC21 = _mm_sub_pd(rC21, rB);
   rB = _mm_unpacklo_pd(rB02, rB02);
   rB = _mm_mul_pd(rB, rA0);
   rC22 = _mm_sub_pd(rC22, rB); rA0 = _mm_load_pd(pA2+2);

   rB = _mm_unpackhi_pd(rB00, rB00);
   rB = _mm_mul_pd(rB, rA1);
   rC20 = _mm_sub_pd(rC20, rB);
   rB = _mm_unpackhi_pd(rB01, rB01);
   rB = _mm_mul_pd(rB, rA1);
   rC21 = _mm_sub_pd(rC21, rB);
   rB = _mm_unpackhi_pd(rB02, rB02);
   rB = _mm_mul_pd(rB, rA1);
   rC22 = _mm_sub_pd(rC22, rB); rA1 = _mm_load_pd(pA3+2);

   rB = _mm_unpacklo_pd(rB20, rB20);
   rB = _mm_mul_pd(rB, rA0);
   rC20 = _mm_sub_pd(rC20, rB);
   rB = _mm_unpacklo_pd(rB21, rB21);
   rB = _mm_mul_pd(rB, rA0);
   rC21 = _mm_sub_pd(rC21, rB);
   rB = _mm_unpacklo_pd(rB22, rB22);
   rB = _mm_mul_pd(rB, rA0);
   rC22 = _mm_sub_pd(rC22, rB);

   rB = _mm_unpackhi_pd(rB20, rB20);
   rB = _mm_mul_pd(rB, rA1);
   rC20 = _mm_sub_pd(rC20, rB); _mm_store_pd(pC0+2, rC20);
   rB = _mm_unpackhi_pd(rB21, rB21);
   rB = _mm_mul_pd(rB, rA1);
   rC21 = _mm_sub_pd(rC21, rB); _mm_store_pd(pC1+2, rC21);
   rB = _mm_unpackhi_pd(rB22, rB22);
   rB = _mm_mul_pd(rB, rA1);
   rC22 = _mm_sub_pd(rC22, rB); _mm_store_pd(pC2+2, rC22);
}

#elif defined(ATL_SSE2) && defined(SREAL)

#include <emmintrin.h>
/*
 * Subtract off x0...x3 contribution to all remaining equations using a
 * rank-4 update with mu=8, nu=4, ku=4.  This version is for 16 SSE regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable vectorization & software pipelining of load/use.
 * Code operates on any multiple of 4 despite using MU=8.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
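/*
 * Added note: below, a single element of B is splatted across an SSE
 * register by reinterpreting the float vector as integers, e.g.
 *    rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x00);
 * copies lane 0 of rB00 into all four lanes; the immediates 0x55, 0xAA
 * and 0xFF select lanes 1, 2 and 3 the same way.  Unlike _mm_shuffle_ps,
 * the integer shuffle needs only one source register.
 */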
ATL_SINLINE void ATL_rk4(ATL_CINT M, const TYPE *A, ATL_CINT lda,
                         TYPE *pB0, ATL_CINT ldb, TYPE *C, ATL_CINT ldc)
{
   const TYPE *pA0 = A, *pA1 = A+lda,
              *pA2 = A+((lda)<<1), *pA3=pA1+((lda)<<1);
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1), *pC3 = pC2+ldc;
   ATL_CINT MM = (M & 4) ? M-4 : M-8;
   ATL_INT i;
   register __m128 rB;
   register __m128 rB00, rB01, rB02, rB03;
   register __m128 rC00, rC01, rC02, rC03;
   register __m128 rC40, rC41, rC42, rC43;
   register __m128 rA0, rA1;
   rB00 = _mm_load_ps(pB0);
   rB01 = _mm_load_ps(pB0+ldb);
   rB02 = _mm_load_ps(pB0+(ldb<<1));
   rB03 = _mm_load_ps(pB0+(ldb<<1)+ldb);

   rC00 = _mm_load_ps(pC0);
   rC01 = _mm_load_ps(pC1);
   rC02 = _mm_load_ps(pC2);
   rC03 = _mm_load_ps(pC3);

   rA0 = _mm_load_ps(pA0);
   for (i=0; i < MM; i += 8, pA0 += 8, pA1 += 8, pA2 += 8, pA3 += 8,
        pC0 += 8, pC1 += 8, pC2 += 8, pC3 += 8)
   {
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC00 = _mm_sub_ps(rC00, rB); rA1 = _mm_load_ps(pA1);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC01 = _mm_sub_ps(rC01, rB); rC40 = _mm_load_ps(pC0+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC02 = _mm_sub_ps(rC02, rB); rC41 = _mm_load_ps(pC1+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC03 = _mm_sub_ps(rC03, rB); rC42 = _mm_load_ps(pC2+4);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC00 = _mm_sub_ps(rC00, rB); rA0 = _mm_load_ps(pA2);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC01 = _mm_sub_ps(rC01, rB); rC43 = _mm_load_ps(pC3+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC03 = _mm_sub_ps(rC03, rB); rA1 = _mm_load_ps(pA3);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC00 = _mm_sub_ps(rC00, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC01 = _mm_sub_ps(rC01, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC03 = _mm_sub_ps(rC03, rB); rA0 = _mm_load_ps(pA0+4);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC00 = _mm_sub_ps(rC00, rB); _mm_store_ps(pC0, rC00);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC01 = _mm_sub_ps(rC01, rB); _mm_store_ps(pC1, rC01);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC02 = _mm_sub_ps(rC02, rB); _mm_store_ps(pC2, rC02);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC03 = _mm_sub_ps(rC03, rB); _mm_store_ps(pC3, rC03);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC40 = _mm_sub_ps(rC40, rB); rA1 = _mm_load_ps(pA1+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC41 = _mm_sub_ps(rC41, rB); rC00 = _mm_load_ps(pC0+8);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC42 = _mm_sub_ps(rC42, rB); rC01 = _mm_load_ps(pC1+8);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC43 = _mm_sub_ps(rC43, rB); rC02 = _mm_load_ps(pC2+8);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC40 = _mm_sub_ps(rC40, rB); rA0 = _mm_load_ps(pA2+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC41 = _mm_sub_ps(rC41, rB); rC03 = _mm_load_ps(pC3+8);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC42 = _mm_sub_ps(rC42, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC43 = _mm_sub_ps(rC43, rB); rA1 = _mm_load_ps(pA3+4);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC40 = _mm_sub_ps(rC40, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC41 = _mm_sub_ps(rC41, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC42 = _mm_sub_ps(rC42, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC43 = _mm_sub_ps(rC43, rB); rA0 = _mm_load_ps(pA0+8);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC40 = _mm_sub_ps(rC40, rB); _mm_store_ps(pC0+4, rC40);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC41 = _mm_sub_ps(rC41, rB); _mm_store_ps(pC1+4, rC41);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC42 = _mm_sub_ps(rC42, rB); _mm_store_ps(pC2+4, rC42);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC43 = _mm_sub_ps(rC43, rB); _mm_store_ps(pC3+4, rC43);
   }
/*
 * If orig M was multiple of 4 rather than 8, drain pipe over last 4 rows
 */
   if (M-MM == 4)
   {
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC00 = _mm_sub_ps(rC00, rB); rA1 = _mm_load_ps(pA1);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC01 = _mm_sub_ps(rC01, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC03 = _mm_sub_ps(rC03, rB); rA0 = _mm_load_ps(pA2);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC00 = _mm_sub_ps(rC00, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC01 = _mm_sub_ps(rC01, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC03 = _mm_sub_ps(rC03, rB); rA1 = _mm_load_ps(pA3);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC00 = _mm_sub_ps(rC00, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC01 = _mm_sub_ps(rC01, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC03 = _mm_sub_ps(rC03, rB);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC00 = _mm_sub_ps(rC00, rB); _mm_store_ps(pC0, rC00);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC01 = _mm_sub_ps(rC01, rB); _mm_store_ps(pC1, rC01);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC02 = _mm_sub_ps(rC02, rB); _mm_store_ps(pC2, rC02);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC03 = _mm_sub_ps(rC03, rB); _mm_store_ps(pC3, rC03);
   }
   else /* drain pipe with MU=8 */
   {
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC00 = _mm_sub_ps(rC00, rB); rA1 = _mm_load_ps(pA1);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC01 = _mm_sub_ps(rC01, rB); rC40 = _mm_load_ps(pC0+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC02 = _mm_sub_ps(rC02, rB); rC41 = _mm_load_ps(pC1+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC03 = _mm_sub_ps(rC03, rB); rC42 = _mm_load_ps(pC2+4);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC00 = _mm_sub_ps(rC00, rB); rA0 = _mm_load_ps(pA2);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC01 = _mm_sub_ps(rC01, rB); rC43 = _mm_load_ps(pC3+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC03 = _mm_sub_ps(rC03, rB); rA1 = _mm_load_ps(pA3);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC00 = _mm_sub_ps(rC00, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC01 = _mm_sub_ps(rC01, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC02 = _mm_sub_ps(rC02, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC03 = _mm_sub_ps(rC03, rB); rA0 = _mm_load_ps(pA0+4);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC00 = _mm_sub_ps(rC00, rB); _mm_store_ps(pC0, rC00);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC01 = _mm_sub_ps(rC01, rB); _mm_store_ps(pC1, rC01);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC02 = _mm_sub_ps(rC02, rB); _mm_store_ps(pC2, rC02);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC03 = _mm_sub_ps(rC03, rB); _mm_store_ps(pC3, rC03);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC40 = _mm_sub_ps(rC40, rB); rA1 = _mm_load_ps(pA1+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC41 = _mm_sub_ps(rC41, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC42 = _mm_sub_ps(rC42, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x00);
      rB = _mm_mul_ps(rB, rA0);
      rC43 = _mm_sub_ps(rC43, rB);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC40 = _mm_sub_ps(rC40, rB); rA0 = _mm_load_ps(pA2+4);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC41 = _mm_sub_ps(rC41, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC42 = _mm_sub_ps(rC42, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0x55);
      rB = _mm_mul_ps(rB, rA1);
      rC43 = _mm_sub_ps(rC43, rB); rA1 = _mm_load_ps(pA3+4);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC40 = _mm_sub_ps(rC40, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC41 = _mm_sub_ps(rC41, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC42 = _mm_sub_ps(rC42, rB);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xAA);
      rB = _mm_mul_ps(rB, rA0);
      rC43 = _mm_sub_ps(rC43, rB);

      rB = (__m128) _mm_shuffle_epi32((__m128i) rB00, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC40 = _mm_sub_ps(rC40, rB); _mm_store_ps(pC0+4, rC40);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB01, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC41 = _mm_sub_ps(rC41, rB); _mm_store_ps(pC1+4, rC41);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB02, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC42 = _mm_sub_ps(rC42, rB); _mm_store_ps(pC2+4, rC42);
      rB = (__m128) _mm_shuffle_epi32((__m128i) rB03, 0xFF);
      rB = _mm_mul_ps(rB, rA1);
      rC43 = _mm_sub_ps(rC43, rB); _mm_store_ps(pC3+4, rC43);
   }
}

#else

/*
 * Subtract off x0...x3 contribution to all remaining equations using a
 * rank-4 update with mu=2, nu=4, ku=4.  This version is for 32 scalar
 * registers, and assumes the scalar registers rB00..rB33 are live on entry.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
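/*
 * Added note: unlike the SIMD versions above, this variant is a macro
 * rather than a function so that the rB00..rB33 values can stay in
 * whatever scalar registers the enclosing solver already holds them in;
 * the solvers below invoke it as, e.g.,
 *    ATL_rk4(M4-k-4, a+4, lda, w+4, M4);
 */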
#define ATL_rk4(M_, A_, lda_, C_, ldc_) if (M_ > 1) \
{ \
   const TYPE *pA0 = A_, *pA1 = A_+lda_, \
              *pA2 = A_+((lda_)<<1), *pA3=pA1+((lda_)<<1); \
   TYPE *pC0 = C_, *pC1 = C_+ldc_, \
        *pC2 = C_+((ldc_)<<1), *pC3=pC1+((ldc_)<<1); \
   register TYPE rC00= *pC0, rC01= *pC1, rC02 = *pC2, rC03 = *pC3; \
   register TYPE rc00, rc01, rc02, rc03; \
   register TYPE rA0 = *pA0, rA1; \
   ATL_CINT MM = M_ - 2; \
   ATL_INT i; \
\
   for (i=0; i < MM; i += 2, pA0 += 2, pA1 += 2, pA2 += 2, pA3 += 2, \
        pC0 += 2, pC1 += 2, pC2 += 2, pC3 += 2) \
   { \
      rC00 -= rA0 * rB00; rA1 = *pA1; \
      rC01 -= rA0 * rB01; rc00 = pC0[1]; \
      rC02 -= rA0 * rB02; rc01 = pC1[1]; \
      rC03 -= rA0 * rB03; rc02 = pC2[1]; \
\
      rC00 -= rA1 * rB10; rA0 = *pA2; \
      rC01 -= rA1 * rB11; rc03 = pC3[1]; \
      rC02 -= rA1 * rB12; \
      rC03 -= rA1 * rB13; \
\
      rC00 -= rA0 * rB20; rA1 = *pA3; \
      rC01 -= rA0 * rB21; \
      rC02 -= rA0 * rB22; \
      rC03 -= rA0 * rB23; rA0 = pA0[1]; \
\
      rC00 -= rA1 * rB30; *pC0 = rC00; \
      rC01 -= rA1 * rB31; *pC1 = rC01; \
      rC02 -= rA1 * rB32; *pC2 = rC02; \
      rC03 -= rA1 * rB33; *pC3 = rC03; \
\
      rc00 -= rA0 * rB00; rA1 = pA1[1]; \
      rc01 -= rA0 * rB01; rC00 = pC0[2]; \
      rc02 -= rA0 * rB02; rC01 = pC1[2]; \
      rc03 -= rA0 * rB03; rC02 = pC2[2]; \
\
      rc00 -= rA1 * rB10; rA0 = pA2[1]; \
      rc01 -= rA1 * rB11; rC03 = pC3[2]; \
      rc02 -= rA1 * rB12; \
      rc03 -= rA1 * rB13; \
\
      rc00 -= rA0 * rB20; rA1 = pA3[1]; \
      rc01 -= rA0 * rB21; \
      rc02 -= rA0 * rB22; \
      rc03 -= rA0 * rB23; rA0 = pA0[2]; \
\
      rc00 -= rA1 * rB30; pC0[1] = rc00; \
      rc01 -= rA1 * rB31; pC1[1] = rc01; \
      rc02 -= rA1 * rB32; pC2[1] = rc02; \
      rc03 -= rA1 * rB33; pC3[1] = rc03; \
   } \
/* \
 * Drain the C fetch/store pipe \
 */ \
   rC00 -= rA0 * rB00; rA1 = *pA1; \
   rC01 -= rA0 * rB01; rc00 = pC0[1]; \
   rC02 -= rA0 * rB02; rc01 = pC1[1]; \
   rC03 -= rA0 * rB03; rc02 = pC2[1]; \
\
   rC00 -= rA1 * rB10; rA0 = *pA2; \
   rC01 -= rA1 * rB11; rc03 = pC3[1]; \
   rC02 -= rA1 * rB12; \
   rC03 -= rA1 * rB13; \
\
   rC00 -= rA0 * rB20; rA1 = *pA3; \
   rC01 -= rA0 * rB21; \
   rC02 -= rA0 * rB22; \
   rC03 -= rA0 * rB23; rA0 = pA0[1]; \
\
   rC00 -= rA1 * rB30; *pC0 = rC00; \
   rC01 -= rA1 * rB31; *pC1 = rC01; \
   rC02 -= rA1 * rB32; *pC2 = rC02; \
   rC03 -= rA1 * rB33; *pC3 = rC03; \
\
   rc00 -= rA0 * rB00; rA1 = pA1[1]; \
   rc01 -= rA0 * rB01; \
   rc02 -= rA0 * rB02; \
   rc03 -= rA0 * rB03; \
\
   rc00 -= rA1 * rB10; rA0 = pA2[1]; \
   rc01 -= rA1 * rB11; \
   rc02 -= rA1 * rB12; \
   rc03 -= rA1 * rB13; \
\
   rc00 -= rA0 * rB20; rA1 = pA3[1]; \
   rc01 -= rA0 * rB21; \
   rc02 -= rA0 * rB22; \
   rc03 -= rA0 * rB23; \
\
   rc00 -= rA1 * rB30; pC0[1] = rc00; \
   rc01 -= rA1 * rB31; pC1[1] = rc01; \
   rc02 -= rA1 * rB32; pC2[1] = rc02; \
   rc03 -= rA1 * rB33; pC3[1] = rc03; \
}

#endif

#if (NRHS == 3)
/*
 * Solve 4x4 L with 3 RHS symbolically
 * Answer is output into rBxx regs, which are live on input and output
 */
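/*
 * Added note: the "/ Lii" steps shown in the comments are implemented as
 * multiplies because the workspace copies built by cpypadL/cpypadU (end
 * of file) store the reciprocal of each diagonal element, e.g.
 *    x1 = (b1 - L10*x0) / L11   becomes   rB10 = (rB10 - L10*rB00) * L11;
 */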
#define ATL_trsmL4(L_, ldl_, r_, ldr_) \
{ \
   const RTYPE L00=(*(L_)), L10=L_[1], L20=L_[2], L30=L_[3]; \
   const RTYPE L11=L_[ldl_+1], L21=L_[ldl_+2], L31=L_[ldl_+3]; \
   const RTYPE L22=L_[2*(ldl_)+2], L32=L_[2*(ldl_)+3]; \
   const RTYPE L33=L_[3*(ldl_)+3]; \
/* \
 * x0 = b0 / L00 \
 */ \
   rB00 *= L00; \
   rB01 *= L00; \
   rB02 *= L00; \
/* \
 * x1 = (b1 - L10 * x0) / L11 \
 */ \
   rB10 = (rB10 - L10*rB00) * L11; \
   rB11 = (rB11 - L10*rB01) * L11; \
   rB12 = (rB12 - L10*rB02) * L11; \
   ATL_pfl1W(r_ + ((ldr_)<<2)); \
/* \
 * x2 = (b2 - L20*x0 - L21*x1) / L22 \
 */ \
   rB20 = (rB20 - L20*rB00 - L21*rB10) * L22; \
   rB21 = (rB21 - L20*rB01 - L21*rB11) * L22; \
   rB22 = (rB22 - L20*rB02 - L21*rB12) * L22; \
   ATL_pfl1W(r_ + ldr_+((ldr_)<<2)); \
/* \
 * x3 = (b3 - L30*x0 - L31*x1 - L32*x2) / L33 \
 */ \
   rB30 = (rB30 - L30*rB00 - L31*rB10 - L32*rB20) * L33; \
   rB31 = (rB31 - L30*rB01 - L31*rB11 - L32*rB21) * L33; \
   rB32 = (rB32 - L30*rB02 - L31*rB12 - L32*rB22) * L33; \
   ATL_pfl1W(r_ + ((ldr_)<<1)+((ldr_)<<2)); \
} /* complete 4x4 NRHS=3 solve block */
#define ATL_trsmU4(U_, ldu_, r_, ldr_) \
{ \
   const RTYPE U00=(*(U_)); \
   const RTYPE U01=(U_)[ldu_], U11=(U_)[ldu_+1]; \
   const RTYPE U02=(U_)[2*(ldu_)], U12= *(U_+2*(ldu_)+1), \
               U22 = *(U_+2*(ldu_)+2); \
   const RTYPE U03 = *(U_+3*(ldu_)), U13 = *(U_+3*(ldu_)+1), \
               U23 = *(U_+3*(ldu_)+2), U33 = *(U_+3*(ldu_)+3); \
/* \
 * x3 = b3 / U33 \
 */ \
   rB30 *= U33; \
   rB31 *= U33; \
   rB32 *= U33; \
   ATL_pfl1W(r_ + ((ldr_)<<2)); \
/* \
 * x2 = (b2 - U23 * x3) / U22 \
 */ \
   rB20 = (rB20 - U23*rB30) * U22; \
   rB21 = (rB21 - U23*rB31) * U22; \
   rB22 = (rB22 - U23*rB32) * U22; \
   ATL_pfl1W(r_ + ldr_+((ldr_)<<2)); \
/* \
 * x1 = (b1 - U12*x2 - U13*x3) / U11 \
 */ \
   rB10 = (rB10 - U12*rB20 - U13*rB30) * U11; \
   rB11 = (rB11 - U12*rB21 - U13*rB31) * U11; \
   rB12 = (rB12 - U12*rB22 - U13*rB32) * U11; \
   ATL_pfl1W(r_ + ((ldr_)<<1)+((ldr_)<<2)); \
/* \
 * x0 = (b0 - U01*x1 - U02*x2 - U03*x3) / U00 \
 */ \
   rB00 = (rB00 - U01*rB10 - U02*rB20 - U03*rB30) * U00; \
   rB01 = (rB01 - U01*rB11 - U02*rB21 - U03*rB31) * U00; \
   ATL_pfl1W(r_ + ldr_+((ldr_)<<1)+((ldr_)<<2)); \
   rB02 = (rB02 - U01*rB12 - U02*rB22 - U03*rB32) * U00; \
} /* complete M=4, N=3 solve block */
#else
/*
 * Solve 4x4 L with 4 RHS symbolically
 * Answer is output into rBxx regs, which are live on input and output
 */
#define ATL_trsmL4(L_, ldl_, r_, ldr_) \
{ \
   const RTYPE L00=(*(L_)), L10=L_[1], L20=L_[2], L30=L_[3]; \
   const RTYPE L11=L_[ldl_+1], L21=L_[ldl_+2], L31=L_[ldl_+3]; \
   const RTYPE L22=L_[2*(ldl_)+2], L32=L_[2*(ldl_)+3]; \
   const RTYPE L33=L_[3*(ldl_)+3]; \
/* \
 * x0 = b0 / L00 \
 */ \
   rB00 *= L00; \
   rB01 *= L00; \
   rB02 *= L00; \
   rB03 *= L00; \
   ATL_pfl1W(r_ + ((ldr_)<<2)); \
/* \
 * x1 = (b1 - L10 * x0) / L11 \
 */ \
   rB10 = (rB10 - L10*rB00) * L11; \
   rB11 = (rB11 - L10*rB01) * L11; \
   rB12 = (rB12 - L10*rB02) * L11; \
   rB13 = (rB13 - L10*rB03) * L11; \
   ATL_pfl1W(r_ + ldr_ +((ldr_)<<2)); \
/* \
 * x2 = (b2 - L20*x0 - L21*x1) / L22 \
 */ \
   rB20 = (rB20 - L20*rB00 - L21*rB10) * L22; \
   rB21 = (rB21 - L20*rB01 - L21*rB11) * L22; \
   rB22 = (rB22 - L20*rB02 - L21*rB12) * L22; \
   rB23 = (rB23 - L20*rB03 - L21*rB13) * L22; \
   ATL_pfl1W(r_ + ((ldr_)<<1)+((ldr_)<<2)); \
/* \
 * x3 = (b3 - L30*x0 - L31*x1 - L32*x2) / L33 \
 */ \
   rB30 = (rB30 - L30*rB00 - L31*rB10 - L32*rB20) * L33; \
   rB31 = (rB31 - L30*rB01 - L31*rB11 - L32*rB21) * L33; \
   ATL_pfl1W(r_ + ldr_+((ldr_)<<1)+((ldr_)<<2)); \
   rB32 = (rB32 - L30*rB02 - L31*rB12 - L32*rB22) * L33; \
   rB33 = (rB33 - L30*rB03 - L31*rB13 - L32*rB23) * L33; \
} /* complete 4x4 solve block */
#define ATL_trsmU4(U_, ldu_, r_, ldr_) \
{ \
   const RTYPE U00=(*(U_)); \
   const RTYPE U01=(U_)[ldu_], U11=(U_)[ldu_+1]; \
   const RTYPE U02=(U_)[2*(ldu_)], U12= *(U_+2*(ldu_)+1), \
               U22 = *(U_+2*(ldu_)+2); \
   const RTYPE U03 = *(U_+3*(ldu_)), U13 = *(U_+3*(ldu_)+1), \
               U23 = *(U_+3*(ldu_)+2), U33 = *(U_+3*(ldu_)+3); \
/* \
 * x3 = b3 / U33 \
 */ \
   rB30 *= U33; \
   rB31 *= U33; \
   rB32 *= U33; \
   rB33 *= U33; \
   ATL_pfl1W(r_ + ((ldr_)<<2)); \
/* \
 * x2 = (b2 - U23 * x3) / U22 \
 */ \
   rB20 = (rB20 - U23*rB30) * U22; \
   rB21 = (rB21 - U23*rB31) * U22; \
   rB22 = (rB22 - U23*rB32) * U22; \
   rB23 = (rB23 - U23*rB33) * U22; \
   ATL_pfl1W(r_ + ldr_+((ldr_)<<2)); \
/* \
 * x1 = (b1 - U12*x2 - U13*x3) / U11 \
 */ \
   rB10 = (rB10 - U12*rB20 - U13*rB30) * U11; \
   rB11 = (rB11 - U12*rB21 - U13*rB31) * U11; \
   rB12 = (rB12 - U12*rB22 - U13*rB32) * U11; \
   rB13 = (rB13 - U12*rB23 - U13*rB33) * U11; \
   ATL_pfl1W(r_ + ((ldr_)<<1)+((ldr_)<<2)); \
/* \
 * x0 = (b0 - U01*x1 - U02*x2 - U03*x3) / U00 \
 */ \
   rB00 = (rB00 - U01*rB10 - U02*rB20 - U03*rB30) * U00; \
   rB01 = (rB01 - U01*rB11 - U02*rB21 - U03*rB31) * U00; \
   ATL_pfl1W(r_ + ldr_+((ldr_)<<1)+((ldr_)<<2)); \
   rB02 = (rB02 - U01*rB12 - U02*rB22 - U03*rB32) * U00; \
   rB03 = (rB03 - U01*rB13 - U02*rB23 - U03*rB33) * U00; \
} /* complete 4x4 solve block */
#endif

static void ATL_trsmLLN
(
   ATL_CINT M,          /* size of orig triangular matrix A */
   ATL_CINT N,          /* number of RHS in B */
   const SCALAR alpha,  /* scale factor for B */
   const TYPE *A,       /* MxM lower matrix A, diag has inverse of original diag */
   TYPE *B,             /* on input, B, on output X, of A x = b */
   ATL_CINT ldb,        /* leading dim of B */
   TYPE *W              /* Mx4 workspace with good alignment */
)
{
   ATL_CINT M4 = ((M+3)>>2)<<2;
   ATL_CINT lda = M4;
   ATL_INT i, j, k;
/*
 * Loop over RHS, NRHS RHS at a time
 */
   for (j=0; j < N; j += NRHS, B += NRHS*ldb)
   {
      const int nb = Mmin(NRHS, N-j);
      TYPE *w = W, *b = B;
      const TYPE *a = A;
/*
 * Copy NRHS RHS to aligned workspace and scale if necessary; alpha cannot be
 * zero, because this is handled as a special case at top
 */
      for (k=0; k < nb; k++, w += M4, b += ldb)
      {
         if (alpha != ATL_rone)
         {
            for (i=0; i < M; i++)
               w[i] = alpha * b[i];
         }
         else
         {
            for (i=0; i < M; i++)
               w[i] = b[i];
         }
         for (; i < M4; i++)
            w[i] = ATL_rzero;
      }
      for (; k < NRHS; k++, w += M4)
         for (i=0; i < M4; i++)
            w[i] = ATL_rzero;
/*
 * Completely solve these RHSs by looping over entire triangular matrix
 */
      w = W;
      b = B;
      for (k=0; k < M; k += 4, b += 4, w += 4, a += (lda+1)<<2)
      {
         ATL_CINT mr = Mmin(4,M-k);
         RTYPE rB00 = *w,    rB10=w[1],      rB20=w[2],      rB30=w[3];
         RTYPE rB01=w[M4],   rB11=w[M4+1],   rB21=w[M4+2],   rB31=w[M4+3];
         RTYPE rB02=w[2*M4], rB12=w[2*M4+1], rB22=w[2*M4+2], rB32=w[2*M4+3];
         RTYPE rB03=w[3*M4], rB13=w[3*M4+1], rB23=w[3*M4+2], rB33=w[3*M4+3];
/*
 * Solve M=4 NRHS=4 TRSM symbolically
 */
         ATL_trsmL4(a, lda, b, ldb);
/*
 * Write solved 4x4 block out to original workspace (final answer)
 * Handle most common case with only one if
 */
         if (mr == 4 && nb == NRHS)
         {
            *b = rB00;
            b[1] = rB10;
            b[2] = rB20;
            b[3] = rB30;
            b[ldb] = rB01;
            b[ldb+1] = rB11;
            b[ldb+2] = rB21;
            b[ldb+3] = rB31;
            b[ldb+ldb] = rB02;
            b[ldb+ldb+1] = rB12;
            b[ldb+ldb+2] = rB22;
            b[ldb+ldb+3] = rB32;
            b[(ldb<<1)+ldb] = rB03;
            b[(ldb<<1)+ldb+1] = rB13;
            b[(ldb<<1)+ldb+2] = rB23;
            b[(ldb<<1)+ldb+3] = rB33;
         }
         else
         {
            b[ldb+ldb+3] = rB32;
            b[ldb+ldb+2] = rB22;
            b[ldb+ldb+1] = rB12;
            b[(ldb<<1)+ldb+3] = rB33;
            b[(ldb<<1)+ldb+2] = rB23;
            b[(ldb<<1)+ldb+1] = rB13;
            b[(ldb<<1)+ldb] = rB03;
         }
/*
 * Subtract off x0-x3 contribution from rest of B using rank-4 update
 */
#if (defined(ATL_AVX) && defined(DREAL)) || \
    (defined(ATL_SSE2) && (defined(DREAL) || defined(SREAL)))
         ATL_rk4(M4-k-4, a+4, lda, w, M4, w+4, M4);
#else
         ATL_rk4(M4-k-4, a+4, lda, w+4, M4);
#endif
      } /* end k-loop that loops through L */
   } /* end j-loop over RHS */
}

static void ATL_trsmLUN
(
   ATL_CINT M,          /* size of orig triangular matrix A */
   ATL_CINT N,          /* number of RHS in B */
   const SCALAR alpha,  /* scale factor for B */
   const TYPE *A,       /* M4xM4 Upper matrix A, diag has inverse of original diag */
   TYPE *B,             /* on input, B, on output X, of A x = b */
   ATL_CINT ldb,        /* leading dim of B */
   TYPE *W              /* M4x4 workspace with good alignment */
)
{
   ATL_CINT M4 = ((M+3)>>2)<<2, mr = M4-M;
   ATL_INT i, j, k;
/*
 * Loop over RHS, NRHS RHS at a time
 */
   for (j=0; j < N; j += NRHS, B += NRHS*ldb)
   {
      const int nb = Mmin(NRHS, N-j);
      TYPE *w = W, *b = B;
      const TYPE *Ac = A + (M4-4)*M4, *a = Ac + M4-4;
/*
 * Copy NRHS RHS to aligned workspace and scale if necessary; alpha cannot be
 * zero, because this is handled as a special case at top
 */
      for (k=0; k < nb; k++, w += M4, b += ldb)
      {
         for (i=0; i < mr; i++)
            w[i] = ATL_rzero;
         if (alpha != ATL_rone)
         {
            for (; i < M4; i++)
               w[i] = alpha * b[i-mr];
         }
         else
         {
            for (; i < M4; i++)
               w[i] = b[i-mr];
         }
      }
      for (; k < NRHS; k++, w += M4)
         for (i=0; i < M4; i++)
            w[i] = ATL_rzero;
/*
 * Completely solve these RHSs by looping over entire triangular matrix
 */
      w = W + M4-4;
      b = B + M-4;
      for (k=0; k < M; k += 4, w -= 4, a -= (M4+1)<<2, Ac -= M4<<2)
      {
         ATL_CINT mr = Mmin(4,M-k);
         RTYPE rB00 = *w,    rB10=w[1],      rB20=w[2],      rB30=w[3];
         RTYPE rB01=w[M4],   rB11=w[M4+1],   rB21=w[M4+2],   rB31=w[M4+3];
         RTYPE rB02=w[2*M4], rB12=w[2*M4+1], rB22=w[2*M4+2], rB32=w[2*M4+3];
         RTYPE rB03=w[3*M4], rB13=w[3*M4+1], rB23=w[3*M4+2], rB33=w[3*M4+3];
/*
 * Solve M=4 NRHS=4 TRSM symbolically
 */
         ATL_trsmU4(a, M4, b, ldb);
/*
 * Write solved 4x4 block out to original workspace (final answer)
 * Handle most common case with only one if
 */
         if (mr == 4 && nb == NRHS)
         {
            *b = rB00;
            b[1] = rB10;
            b[2] = rB20;
            b[3] = rB30;
            b[ldb] = rB01;
            b[ldb+1] = rB11;
            b[ldb+2] = rB21;
            b[ldb+3] = rB31;
            b[ldb+ldb] = rB02;
            b[ldb+ldb+1] = rB12;
            b[ldb+ldb+2] = rB22;
            b[ldb+ldb+3] = rB32;
            b[(ldb<<1)+ldb] = rB03;
            b[(ldb<<1)+ldb+1] = rB13;
            b[(ldb<<1)+ldb+2] = rB23;
            b[(ldb<<1)+ldb+3] = rB33;
         }
         else
         {
            ATL_CINT ldb3 = ldb+(ldb<<1);
            b[ldb+ldb+1] = rB32;
            b[ldb+ldb+1] = rB22;
            b[ldb+ldb+2] = rB32;
            b[ldb+ldb+1] = rB12;
            b[ldb+ldb+2] = rB22;
            b[ldb+ldb+3] = rB32;
         }
/*
 * Subtract off x0-x3 contribution from rest of B using rank-4 update
 */
#if (defined(ATL_AVX) && defined(DREAL)) || \
    (defined(ATL_SSE2) && (defined(DREAL) || defined(SREAL)))
         ATL_rk4(M4-k-4, Ac, M4, w, M4, W, M4);
#else
         ATL_rk4(M4-k-4, Ac, M4, W, M4);
#endif
      } /* end k-loop that loops through U */
   } /* end j-loop over RHS */
}

ATL_SINLINE void trL2U
   (ATL_CINT N, const TYPE *L, ATL_CINT ldl, TYPE *U, ATL_CINT ldu)
/*
 * reflects lower part of L into upper part of U
 */
{
   ATL_INT i, j;
   const TYPE *Lc = L;

   for (j=0; j < N; j++, Lc += ldl)
   {
      for (i=j; i < N; i++)
         U[j+i*ldu] = Lc[i];
   }
}

ATL_SINLINE void trU2L
   (ATL_CINT N, const TYPE *U, ATL_CINT ldu, TYPE *L, ATL_CINT ldl)
/*
 * reflects upper part of U into lower part of L
 */
{
   ATL_INT i, j;
   const TYPE *Uc = U;

   for (j=0; j < N; j++, Uc += ldu)
   {
      TYPE *Lr = L;
      for (i=0; i <= j; i++, Lr += ldl)
         Lr[j] = Uc[i];
   }
}

/*
 * Copy original U to aligned workspace, invert diagonal elts, pad wt I
 * Padding is at top of upper triangular matrix
 */
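/*
 * Added illustration: for N=6 padded to N4=8 (mr=2), the workspace copy
 * built below is laid out (column-major, leading dim N4) as
 *        [ I_mr  0 ]
 *    a = [  0    U ]
 * so the padding rows solve trivially and the true U sits in the
 * bottom-right NxN block, with its diagonal stored as reciprocals.
 */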
ATL_SINLINE void cpypadU
(
   enum ATLAS_DIAG Diag,
   ATL_CINT N,      /* size of triangular matrix A */
   const TYPE *A,   /* upper triangular matrix */
   ATL_CINT lda,    /* leading dim of A */
   TYPE *a,         /* cpy of A, padded to N4 with I */
   ATL_CINT N4      /* padded size, leading dim of a */
)
{
   ATL_INT i, j;
   const int mr = N4-N;

   for (j=0; j < mr; j++, a += N4)
   {
      for (i=0; i < N4; i++)
         a[i] = (i == j) ? ATL_rone : ATL_rzero;
   }
   for (; j < N4; j++, a += N4, A += lda)
   {
      for (i=0; i < mr; i++)
         a[i] = ATL_rzero;
      for (; i < j; i++)
         a[i] = A[i-mr];
      a[j] = (Diag == AtlasNonUnit) ? 1.0 / A[j-mr] : ATL_rone;
   }
}

/*
 * Copy original L to aligned workspace, invert diagonal elts, pad wt I
 */
ATL_SINLINE void cpypadL
(
   enum ATLAS_DIAG Diag,
   ATL_CINT N,      /* size of triangular matrix A */
   const TYPE *A,   /* lower triangular matrix */
   ATL_CINT lda,    /* leading dim of A */
   TYPE *a,         /* cpy of A, padded to N4 with I */
   ATL_CINT N4      /* padded size, leading dim of a */
)
{
   ATL_INT i, j;

   for (j=0; j < N; j++, a += N4, A += lda)
   {
      a[j] = (Diag == AtlasNonUnit) ? 1.0 / A[j] : ATL_rone;
      for (i=j+1; i < N; i++)
         a[i] = A[i];
   }
   for (; j < N4; j++, a += N4)
   {
      for (i=0; i < N4; i++)
         a[i] = (i == j) ? ATL_rone : ATL_rzero;
   }
}

int Mjoin(PATL,trsmKL_rk4)  /* returns 0 on success */
(
   enum ATLAS_SIDE Side,
   enum ATLAS_UPLO Uplo,
   enum ATLAS_TRANS TA,
   enum ATLAS_DIAG Diag,
   ATL_CINT M,          /* size of triangular matrix A */
   ATL_CINT N,          /* number of RHS in B */
   const SCALAR alpha,  /* scale factor for B */
   const TYPE *A0,      /* MxM triangular matrix A */
   ATL_CINT lda0,       /* leading dim of A */
   TYPE *B,             /* on input, B, on output X, of A x = b */
   ATL_CINT ldb         /* leading dim of B */
)
{
   void *vp;
   TYPE *a, *w, *t=NULL;
   ATL_CINT M4 = ((M+3)>>2)<<2;
   const TYPE *A = A0;
   ATL_INT lda = lda0;
   int UPPER = (Uplo == AtlasUpper);

   if (TA == AtlasTrans)
   {
      t = malloc(M*M*sizeof(TYPE));
      if (t == NULL)
         return(1);
      if (UPPER)
         trU2L(M, A0, lda0, t, M);
      else
         trL2U(M, A0, lda0, t, M);
      A = (const TYPE *) t;
      lda = M;
      UPPER = !UPPER;
   }

   vp = malloc(sizeof(TYPE)*(M4*M4+M4*NRHS)+2*ATL_Cachelen);
   if (vp == NULL)
   {
      if (t)
         free(t);
      return(1);
   }
   a = ATL_AlignPtr(vp);
   w = a + M4*M4;
   w = ATL_AlignPtr(w);
   if (!UPPER)
   {
      cpypadL(Diag, M, A, lda, a, M4);
      ATL_trsmLLN(M, N, alpha, a, B, ldb, w);
   }
   else /* Uplo == AtlasUpper */
   {
      const int mr = M4-M;
      cpypadU(Diag, M, A, lda, a, M4);
      ATL_trsmLUN(M, N, alpha, a, B, ldb, w);