/*
 * Automatically Tuned Linear Algebra Software v3.10.1
 * Copyright (C) 2009 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "atlas_misc.h"
#include "atlas_level1.h"
/*
 * Blocking factor used by the blocked transpose in this file.  The two
 * helper macros below hard-code a shift by 5, i.e. a factor of 32, so NB
 * must be 32 for them to be consistent with it.
 * NOTE(review): NB is only defined here if nothing earlier defined it;
 * confirm that no included header supplies a different value.
 */
#ifndef NB
   #define NB 32
#endif
/* n_ * 32, done as a left shift by 5 */
#define ATL_MulByNB(n_) ((n_)<<5)
/* n_ / 32 (for non-negative n_), done as a right shift by 5 */
#define ATL_DivByNB(n_) ((n_)>>5)
36
static void Mjoin(PATL,sqtrans0)(ATL_CINT N, TYPE *C, ATL_CINT ldc)
38
* Does an in-place transpose of a square matrix.
39
* NOTE: this should only be used on small matrices, as it is not optimized
45
* We will work by reflecting swapping columns & rows across diagonal,
46
* starting from the last column, so that early cols are retained in cache
49
Mjoin(PATL,swap)(j, C+((size_t)ldc)*(j SHIFT), 1, C+(j SHIFT), ldc);
52
void Mjoin(PATL,sqtrans)(ATL_CINT N, TYPE *C, ATL_CINT ldc)
54
* Does an in-place transpose of a square matrix. This routine is blocked
58
const size_t ldt = ldc;
59
ATL_CINT Nnb = ATL_MulByNB(ATL_DivByNB(N)), Nr = N - Nnb;
64
Mjoin(PATL,sqtrans0)(N, C, ldc);
68
* Loop in reverse order, so first part of matrix retained in cache
72
for (i=0; i < Nnb; i += NB)
73
Mjoin(PATL,geswapT)(NB, Nr, C+((Nnb*ldt+i)SHIFT), ldc,
74
C+((Nnb+i*ldt)SHIFT), ldc);
75
Mjoin(PATL,sqtrans0)(Nr, C+((Nnb*(ldt+1))SHIFT), ldc);
77
for (j=Nnb-NB; j >= 0; j -= NB)
80
for (i=0; i < j; i += NB)
81
Mjoin(PATL,geswapT)(NB, NB, C+((j*ldt+i)SHIFT), ldc,
82
C+((j+i*ldt)SHIFT), ldc);
83
Mjoin(PATL,sqtrans0)(NB, C+((j*(ldt+1))SHIFT), ldc);