/*
 *             Automatically Tuned Linear Algebra Software v3.10.1
 * Copyright (C) 2010 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributors may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef ATL_SSE1
   #error "This routine requires SSE1!"
#endif
#include <xmmintrin.h>

#include "atlas_misc.h"
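
/*
 * Single-precision rank-1 update (GER): A[i + j*lda1] += X[i] * Y[j] for a
 * column-major M x N matrix A with leading dimension lda1.  The M dimension
 * is vectorized with SSE1, updating MU=8 rows per iteration, and the N
 * dimension is unrolled NU=4 columns deep.  A scalar peel loop first walks
 * forward to a 16-byte boundary so the vector body can use aligned
 * loads/stores; per the "force X/A alignment" peel below, the kernel assumes
 * X shares A's 16-byte misalignment.
 */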
void ATL_UGERK   /* entry-point name assumed: ATLAS's unrolled-GER kernel hook */
   (ATL_CINT M, ATL_CINT N, const TYPE *X, const TYPE *Y,
    TYPE *A, ATL_CINT lda1)
{/* BEGIN GER: nMU=1, MU=8, NU=4 */
   ATL_INT i, j;                      /* row (i) and column (j) loop indices */
   ATL_CINT MAp = ( (((((size_t)A)+15)>>4)<<4) - ((size_t)A) )/sizeof(TYPE);
   ATL_CINT MA = M - MAp;             /* rows remaining after the alignment peel */
   ATL_CINT M8=((MA/8)*8)+MAp, N4=((N/4)*4), lda2=lda1+lda1, lda3=lda2+lda1,
            lda4=lda3+lda1;
   __m128 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3,
          a0_0, m0_0, a1_0, m1_0, a2_0, m2_0, a3_0, m3_0,
          a4_0, m4_0, a5_0, m5_0, a6_0, m6_0, a7_0, m7_0,
          a0_1, m0_1, a1_1, m1_1, a2_1, m2_1, a3_1, m3_1,
          a4_1, m4_1, a5_1, m5_1, a6_1, m6_1, a7_1, m7_1,
          a0_2, m0_2, a1_2, m1_2, a2_2, m2_2, a3_2, m3_2,
          a4_2, m4_2, a5_2, m5_2, a6_2, m6_2, a7_2, m7_2,
          a0_3, m0_3, a1_3, m1_3, a2_3, m2_3, a3_3, m3_3,
          a4_3, m4_3, a5_3, m5_3, a6_3, m6_3, a7_3, m7_3;
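
   /*
    * Four columns of A are updated per pass of the loop below: Y[j..j+3]
    * are broadcast into y0..y3, then each 8-row strip of X is multiplied
    * against all four broadcasts and accumulated into the matching columns.
    */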
   for (j=0; j < N4; j += 4, A += lda4, Y += 4)
   {/* BEGIN N-LOOP UR=4 */
      y0 = _mm_load1_ps(Y);      /* broadcast Y[0..3] into y0..y3 */
      y1 = _mm_load1_ps(Y+1);
      y2 = _mm_load1_ps(Y+2);
      y3 = _mm_load1_ps(Y+3);
      for (i=0; i < MAp; i++)
      {/* peel to force X/A alignment */
         x0 = _mm_load_ss(X+i+0);
         a0_0 = _mm_load_ss(A+i+0);
         m0_0 = _mm_mul_ss(x0, y0);
         a0_0 = _mm_add_ss(a0_0, m0_0);
         _mm_store_ss(A+i+0, a0_0);
         a0_1 = _mm_load_ss(A+i+0+lda1);
         m0_1 = _mm_mul_ss(x0, y1);
         a0_1 = _mm_add_ss(a0_1, m0_1);
         _mm_store_ss(A+i+0+lda1, a0_1);
         a0_2 = _mm_load_ss(A+i+0+lda2);
         m0_2 = _mm_mul_ss(x0, y2);
         a0_2 = _mm_add_ss(a0_2, m0_2);
         _mm_store_ss(A+i+0+lda2, a0_2);
         a0_3 = _mm_load_ss(A+i+0+lda3);
         m0_3 = _mm_mul_ss(x0, y3);
         a0_3 = _mm_add_ss(a0_3, m0_3);
         _mm_store_ss(A+i+0+lda3, a0_3);
      } /* end force-align peel */
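
      /* Aligned SSE body: two 4-float vectors per column cover MU=8 rows. */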
      for (i=MAp; i < M8; i += 8)
      {/* ----- BEGIN M-LOOP BODY ----- */
         /* --- BEGIN MUxNU UNROLL 0 --- */
         x0 = _mm_load_ps(X+i+0);
         a0_0 = _mm_load_ps(A+i+0);
         m0_0 = _mm_mul_ps(x0, y0);
         a0_0 = _mm_add_ps(a0_0, m0_0);
         _mm_store_ps(A+i+0, a0_0);
         x4 = _mm_load_ps(X+i+4);
         a4_0 = _mm_load_ps(A+i+4);
         m4_0 = _mm_mul_ps(x4, y0);
         a4_0 = _mm_add_ps(a4_0, m4_0);
         _mm_store_ps(A+i+4, a4_0);
         a0_1 = _mm_load_ps(A+i+0+lda1);
         m0_1 = _mm_mul_ps(x0, y1);
         a0_1 = _mm_add_ps(a0_1, m0_1);
         _mm_store_ps(A+i+0+lda1, a0_1);
         a4_1 = _mm_load_ps(A+i+4+lda1);
         m4_1 = _mm_mul_ps(x4, y1);
         a4_1 = _mm_add_ps(a4_1, m4_1);
         _mm_store_ps(A+i+4+lda1, a4_1);
         a0_2 = _mm_load_ps(A+i+0+lda2);
         m0_2 = _mm_mul_ps(x0, y2);
         a0_2 = _mm_add_ps(a0_2, m0_2);
         _mm_store_ps(A+i+0+lda2, a0_2);
         a4_2 = _mm_load_ps(A+i+4+lda2);
         m4_2 = _mm_mul_ps(x4, y2);
         a4_2 = _mm_add_ps(a4_2, m4_2);
         _mm_store_ps(A+i+4+lda2, a4_2);
         a0_3 = _mm_load_ps(A+i+0+lda3);
         m0_3 = _mm_mul_ps(x0, y3);
         a0_3 = _mm_add_ps(a0_3, m0_3);
         _mm_store_ps(A+i+0+lda3, a0_3);
         a4_3 = _mm_load_ps(A+i+4+lda3);
         m4_3 = _mm_mul_ps(x4, y3);
         a4_3 = _mm_add_ps(a4_3, m4_3);
         _mm_store_ps(A+i+4+lda3, a4_3);
         /* --- END MUxNU UNROLL 0 --- */
      }/* ----- END M-LOOP BODY ----- */

      {/* ----- BEGIN VECTOR UNROLL M CLEANUP ----- */
         for (i=M8; i < M; i++)
         {/* ----- BEGIN SCALAR M CLEANUP ----- */
            x0 = _mm_load_ss(X+i+0);
            a0_0 = _mm_load_ss(A+i+0);
            m0_0 = _mm_mul_ss(x0, y0);
            a0_0 = _mm_add_ss(a0_0, m0_0);
            _mm_store_ss(A+i+0, a0_0);
            a0_1 = _mm_load_ss(A+i+0+lda1);
            m0_1 = _mm_mul_ss(x0, y1);
            a0_1 = _mm_add_ss(a0_1, m0_1);
            _mm_store_ss(A+i+0+lda1, a0_1);
            a0_2 = _mm_load_ss(A+i+0+lda2);
            m0_2 = _mm_mul_ss(x0, y2);
            a0_2 = _mm_add_ss(a0_2, m0_2);
            _mm_store_ss(A+i+0+lda2, a0_2);
            a0_3 = _mm_load_ss(A+i+0+lda3);
            m0_3 = _mm_mul_ss(x0, y3);
            a0_3 = _mm_add_ss(a0_3, m0_3);
            _mm_store_ss(A+i+0+lda3, a0_3);
         }/* ----- END SCALAR M CLEANUP ----- */
      }/* ----- END VECTOR UNROLL M CLEANUP ----- */
   }/* END N-LOOP UR=4 */
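
   /*
    * Column cleanup: the trailing N%4 columns get the same treatment one
    * column at a time (align peel, 8-row vector body, scalar remainder).
    */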
   for (j=N4; j < N; j += 1, A += lda1, Y++)
   {/* BEGIN N-LOOP UR=1 */
      y0 = _mm_load1_ps(Y);
      for (i=0; i < MAp; i++)
      {/* peel to force X/A alignment */
         x0 = _mm_load_ss(X+i+0);
         a0_0 = _mm_load_ss(A+i+0);
         m0_0 = _mm_mul_ss(x0, y0);
         a0_0 = _mm_add_ss(a0_0, m0_0);
         _mm_store_ss(A+i+0, a0_0);
      } /* end force-align peel */

      for (i=MAp; i < M8; i += 8)
      {/* ----- BEGIN M-LOOP BODY ----- */
         /* --- BEGIN MUxNU UNROLL 0 --- */
         x0 = _mm_load_ps(X+i+0);
         a0_0 = _mm_load_ps(A+i+0);
         m0_0 = _mm_mul_ps(x0, y0);
         a0_0 = _mm_add_ps(a0_0, m0_0);
         _mm_store_ps(A+i+0, a0_0);
         x4 = _mm_load_ps(X+i+4);
         a4_0 = _mm_load_ps(A+i+4);
         m4_0 = _mm_mul_ps(x4, y0);
         a4_0 = _mm_add_ps(a4_0, m4_0);
         _mm_store_ps(A+i+4, a4_0);
         /* --- END MUxNU UNROLL 0 --- */
      }/* ----- END M-LOOP BODY ----- */

      {/* ----- BEGIN VECTOR UNROLL M CLEANUP ----- */
         for (i=M8; i < M; i++)
         {/* ----- BEGIN SCALAR M CLEANUP ----- */
            x0 = _mm_load_ss(X+i+0);
            a0_0 = _mm_load_ss(A+i+0);
            m0_0 = _mm_mul_ss(x0, y0);
            a0_0 = _mm_add_ss(a0_0, m0_0);
            _mm_store_ss(A+i+0, a0_0);
         }/* ----- END SCALAR M CLEANUP ----- */
      }/* ----- END VECTOR UNROLL M CLEANUP ----- */
   }/* END N-LOOP UR=1 */
}/* END GER: nMU=1, MU=8, NU=4 */
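
/*
 * Usage sketch (hypothetical driver; assumes TYPE == float and the assumed
 * ATL_UGERK entry name above).  For a column-major A with lda1 >= M:
 *
 *    float x[64], y[8], A[64*8];
 *    ... fill x, y, A ...
 *    ATL_UGERK(64, 8, x, y, A, 64);
 *
 * leaves A[i + j*64] incremented by x[i]*y[j] for all i < 64, j < 8.
 */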