* Automatically Tuned Linear Algebra Software v3.2
* (C) Copyright 1999 Camm Maguire
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions, and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the University of Tennessee, the ATLAS group,
* or the names of its contributers may not be used to endorse
* or promote products derived from this software without specific
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
/* str(): stringize after macro expansion, via the xstr() helper
 * (xstr is defined on a line missing from this copy of the file). */
#define str(a_) xstr(a_)

/* Prefetch byte offsets as strings, spliced into the asm templates.
 * PREFN / PREFN2 are presumably set by the ATLAS build — not visible here. */
#define PREFA str(PREFN)
#define PREFA2 str(PREFN2)

/* Every asm block is volatile: register state is carried between
 * separate __asm__ statements, so reordering or removal would break them. */
#define VOLATILE __volatile__
/* la: %eax <- current vector pointer (%esi); advance %esi by 16 bytes
 * (one 4-float SSE vector).  NOTE(review): per load_regs below, %esi
 * appears to walk the c input vector — confirm against the full file. */
#define la __asm__ VOLATILE ("movl %esi,%eax\n\taddl $16,%esi\n\t")
/* lb: %ebx <- %eax + PREFA, the prefetch-ahead address. */
#define lb __asm__ VOLATILE ("movl %eax,%ebx\n\taddl $" PREFA ",%ebx\n\t")
/* Prefetch (into all cache levels) at the %ebx / %eax addresses. */
#define prefetch __asm__ VOLATILE ("prefetcht0 (%ebx)\n\t")
#define prefetcha __asm__ VOLATILE ("prefetcht0 (%eax)\n\t")
/* loadc: 4 floats of the c vector into %xmm7 (unaligned load). */
#define loadc __asm__ VOLATILE ("movups (%eax),%xmm7\n\t")
/* movc: working copy of the c vector in %xmm5; %xmm7 is preserved for
 * the remaining dot products of this 4-float step. */
#define movc __asm__ VOLATILE ("movups %xmm7,%xmm5\n\t")
/* loadb/writeb: load 4 floats of the b (output) vector, and store the
 * updated values back.  Two variants are present: unaligned (movups)
 * and aligned streaming (movaps / non-temporal movntps).
 * NOTE(review): these were almost certainly selected by an #ifdef whose
 * guard lines are missing from this copy; as written, the later aligned
 * definitions silently win (redefinition warning).  Recover the original
 * conditional before building. */
#define loadb __asm__ VOLATILE ("movups (%eax),%xmm6\n\t")
#define writeb __asm__ VOLATILE ("movups %xmm6,(%eax)\n\t")

#define loadb __asm__ VOLATILE ("movaps (%eax),%xmm6\n\t")
#define writeb __asm__ VOLATILE ("movntps %xmm6,(%eax)\n\t")
/* Clear %xmm<a_>. */
#define xor(a_) __asm__ VOLATILE ("xorps %xmm" #a_ ",%xmm" #a_ "\n\t")
/* %xmm5 *= %xmm<a_> : scale the c-vector copy by a broadcast a value
 * (the xmm registers are filled by preload_regs below). */
#define mul(a_) __asm__ VOLATILE ("mulps %xmm" #a_ ",%xmm5\n\t")
/* %xmm6 += %xmm5 : accumulate the product into the loaded b values. */
#define add __asm__ VOLATILE ("addps %xmm5,%xmm6\n\t")
/* %e<b_>x += %e<a_>x : advance a pointer register by a stride register.
 * NOTE(review): which of %ecx/%edx holds which byte offset is set in
 * load_regs, whose operand-loading lines are missing from this copy. */
#define incx(a_,b_) __asm__ VOLATILE ("addl %e" #a_ "x,%e" #b_ "x\n\t")

/* dp(a_,b_): advance %eax by the stride in %e<a_>x, then
 * b[0:4] += a<b_> * c[0:4]  (load b, copy c, mul, add, store). */
#define dp(a_,b_) {incx(a_,a);loadb;movc;mul(b_);add;writeb;}
/* pref(a_): bump %ebx by %e<a_>x and prefetch the next cache line. */
#define pref(a_) {incx(a_,b);prefetch;}
/* prefa(a_): bump %eax by %e<a_>x and prefetch there. */
#define prefa(a_) {incx(a_,a);prefetcha;}
/* dpp: dp with an interleaved prefetch to hide memory latency. */
#define dpp(a_,b_,c_) {incx(a_,a);loadb;movc;pref(c_);mul(b_);add;writeb;}
/* bla<N>: process one 4-float vector through N accumulated dot products,
 * with the heavier prefetch schedule (lb/prefetch up front). */
#define bla1 {la;lb;prefetch;loadc;dp(d,0);}
#define bla2 {bla1;dpp(c,1,d);}
#define bla3 {bla2;dp(c,2);}
#define bla4 {bla3;dpp(c,3,c);}
#define bla5 {bla4;dp(c,4);}

/* blb<N>: same arithmetic as bla<N> with a lighter, differently-placed
 * prefetch pattern; DOT_PROD8 below pairs one bla step with one blb step. */
#define blb1 {la;pref(d);loadc;dp(d,0);}
#define blb2 {la;loadc;dpp(d,0,c);dp(c,1);}
#define blb3 {la;pref(c);loadc;dp(d,0);dpp(c,1,c);dp(c,2);}
#define blb4 {blb2;dpp(c,2,c);dp(c,3);}
#define blb5 {blb3;dpp(c,3,c);dp(c,4);}
/* DOT_PROD4: one 4-float step for all NDP dot products (expands to
 * bla<NDP>).  Mjoin (token paste) and NDP come from the including file. */
#define DOT_PROD4 {Mjoin(bla,NDP);}
/* DOT_PROD8: 8 floats = heavy-prefetch step + light-prefetch step. */
#define DOT_PROD8 {DOT_PROD4;Mjoin(blb,NDP);}
/* DOT_PROD16: 16 floats, the body of the main unrolled loop. */
#define DOT_PROD16 {DOT_PROD8;DOT_PROD8;}
89
/*
 * LOOP: opens the "loop_<EXT>" label for the main block and branches
 * between the 16-float and 8-float code paths.
 * NOTE(review): this copy of the file is missing original source lines
 * inside the macro (the bare numbers below are extraction artifacts, and
 * original lines 90/93/95 are absent), so the asm here is incomplete —
 * recover the original file before editing.
 */
#define LOOP __asm__ VOLATILE ("\nloop_" str(EXT) ":\n\t"\
91
"jne block16_" str(EXT) "\n\t"\
92
"jmp block8_" str(EXT) "\n\t"\
94
"\nblock16_" str(EXT) ":\n\t")\
96
/*
 * LAB8: jump back to loop_<EXT>, then the 8-float block: test bit 3 of
 * %edi (remaining length) and skip to block4_<EXT> when clear.
 * NOTE(review): original lines 97/99/102 are missing here as well —
 * incomplete in this copy.
 */
#define LAB8 __asm__ VOLATILE ("jmp loop_" str(EXT) "\n\t"\
98
"block8_" str(EXT) ":\n\t"\
100
"testl $8,%%edi\n\t"\
101
"je block4_" str(EXT) "\n\t"\
103
/*
 * LAB4: "block4_<EXT>" label; jumps to block1_<EXT> when no 4-float
 * group remains.  NOTE(review): original line 104 is missing from this
 * copy (the interleaved "105" is an extraction artifact) — incomplete.
 */
#define LAB4 __asm__ VOLATILE ("\nblock4_" str(EXT) ":\n\t"\
105
"je block1_" str(EXT) "\n\t")
/* LAB1(a_): emits the "block1_<EXT>" label and stores the current
 * pointer (%esi) back into the C lvalue a_, declaring %esi clobbered. */
#define LAB1(a_) __asm__ VOLATILE ("\nblock1_" str(EXT) ":\n\tmovl %%esi,%0\n\t" \
: "=m" (a_) : : "si" )
110
/*
 * load_regs(a_,b_,c_,d_): load the kernel's fixed register state —
 * %esi <- pointer a_, %eax a copy of it, %ebx = %eax + PREFA2 for
 * prefetching — clobbering eax/ebx/ecx/edx/esi/edi.
 * NOTE(review): original lines 115-117 (presumably the moves that fill
 * %ecx/%edx/%edi from operands 1-3) are missing from this copy; the
 * interleaved bare numbers are extraction artifacts.  Recover the
 * original file before modifying.
 */
#define load_regs(a_,b_,c_,d_) \
111
__asm__ VOLATILE ("movl %0,%%esi\n\t"\
112
"movl %%esi,%%eax\n\t"\
113
"movl %%eax,%%ebx\n\t"\
114
"addl $" PREFA2 ",%%ebx\n\t"\
118
: : "m" (a_),"m" (b_),"m" (c_),"m" (d_) : \
119
"ax","bx","cx","dx","si","di")
/* ipref(a_): prefetch both streams — bump-and-prefetch at %eax and at
 * the prefetch-ahead address %ebx. */
#define ipref(a_) {prefa(a_);pref(a_);}

/* ir<N>: initial prefetches covering N dot-product rows before the
 * main loop starts. */
#define ir1 {prefetcha;prefetch;ipref(d);}
#define ir2 {ir1;ipref(c);}
#define ir3 {ir2;ipref(c);}
#define ir4 {ir3;ipref(c);}
#define ir5 {ir4;ipref(c);}

/* init_regs expands to ir<NDP>; NDP (1..5) is the number of
 * simultaneous dot products, defined by the including file. */
#define init_regs Mjoin(ir,NDP)
/* init_preload(a_,b_): %ecx <- pointer a_, %edx <- byte stride b_, for
 * the preload_reg() broadcast sequence below (clobbers eax/ebx). */
#define init_preload(a_,b_) __asm__ VOLATILE (\
"movl %0,%%ecx\n\tmovl %1,%%edx\n\t"::"m" (a_),"m" (b_):"ax","bx")
/* inca: advance the preload pointer %ecx by the byte stride in %edx. */
#define inca __asm__ VOLATILE ("addl %edx,%ecx\n\t");

/* preload_reg(a_): load one scalar from (%ecx) and splat it across all
 * four lanes of %xmm<a_> (movss then shufps $0 = broadcast). */
#define preload_reg(a_) __asm__ VOLATILE (\
"movss (%ecx),%xmm7\n\tmovups %xmm7,%xmm" #a_ "\n\tshufps $0,%xmm7,%xmm" #a_ "\n\t")

/* pr<N>: broadcast the first N scalars (stride %edx apart) into
 * %xmm0 .. %xmm(N-1). */
#define pr1 {preload_reg(0);}
#define pr2 {pr1;inca;preload_reg(1);}
#define pr3 {pr2;inca;preload_reg(2);}
#define pr4 {pr3;inca;preload_reg(3);}
#define pr5 {pr4;inca;preload_reg(4);}

/* preload_regs(a_,b_): set %ecx/%edx, then splat NDP scalars into the
 * xmm multiplier registers (expands to pr<NDP>). */
#define preload_regs(a_,b_) {init_preload(a_,b_);Mjoin(pr,NDP);}
149
Mjoin(g,EXT)(const float *a,int ainc,float *b,int ldb,const float *c,int len) {
151
const float *ce,*ce16,*ba;
156
c2b=(b-c)*sizeof(*c);
162
ba=(const float *)(((unsigned int)(b+3)>>4)<<4);
163
for (;c<ce && b<ba;c++,b++) {
164
for (ta=a,tb=b,i=0;i<NDP;i++,ta+=ainc,tb+=ldb)
169
ce16 = c + ((len>>4)<<4);
173
load_regs(c,ce16,c2b,b2b);
191
for (;c < ce;c++,b++) {
192
for (ta=a,tb=b,i=0;i<NDP;i++,ta+=ainc,tb+=ldb)