1
// This file is part of Eigen, a lightweight C++ template library
4
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
6
// Eigen is free software; you can redistribute it and/or
7
// modify it under the terms of the GNU Lesser General Public
8
// License as published by the Free Software Foundation; either
9
// version 3 of the License, or (at your option) any later version.
11
// Alternatively, you can redistribute it and/or
12
// modify it under the terms of the GNU General Public License as
13
// published by the Free Software Foundation; either version 2 of
14
// the License, or (at your option) any later version.
16
// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
17
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
18
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
19
// GNU General Public License for more details.
21
// You should have received a copy of the GNU Lesser General Public
22
// License and a copy of the GNU General Public License along with
23
// Eigen. If not, see <http://www.gnu.org/licenses/>.
25
#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
26
#define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
30
/* Optimized selfadjoint matrix * vector product:
31
* This algorithm processes 2 columns at onces that allows to both reduce
32
* the number of load/stores of the result by a factor 2 and to reduce
33
* the instruction dependency.
35
template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
36
static EIGEN_DONT_INLINE void product_selfadjoint_vector(
38
const Scalar* lhs, Index lhsStride,
39
const Scalar* _rhs, Index rhsIncr,
43
typedef typename packet_traits<Scalar>::type Packet;
44
typedef typename NumTraits<Scalar>::Real RealScalar;
45
const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
48
IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
49
IsLower = UpLo == Lower ? 1 : 0,
50
FirstTriangular = IsRowMajor == IsLower
53
conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
54
conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
55
conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
57
conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
58
conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
60
Scalar cjAlpha = ConjugateRhs ? conj(alpha) : alpha;
62
// FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
63
// if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
64
// this is because we need to extract packets
65
ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);
68
const Scalar* it = _rhs;
69
for (Index i=0; i<size; ++i, it+=rhsIncr)
73
Index bound = std::max(Index(0),size-8) & 0xfffffffe;
77
for (Index j=FirstTriangular ? bound : 0;
78
j<(FirstTriangular ? size : bound);j+=2)
80
register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
81
register const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
83
Scalar t0 = cjAlpha * rhs[j];
84
Packet ptmp0 = pset1<Packet>(t0);
85
Scalar t1 = cjAlpha * rhs[j+1];
86
Packet ptmp1 = pset1<Packet>(t1);
89
Packet ptmp2 = pset1<Packet>(t2);
91
Packet ptmp3 = pset1<Packet>(t3);
93
size_t starti = FirstTriangular ? 0 : j+2;
94
size_t endi = FirstTriangular ? j : size;
95
size_t alignedStart = (starti) + first_aligned(&res[starti], endi-starti);
96
size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
98
// TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
99
res[j] += cjd.pmul(internal::real(A0[j]), t0);
100
res[j+1] += cjd.pmul(internal::real(A1[j+1]), t1);
103
res[j] += cj0.pmul(A1[j], t1);
104
t3 += cj1.pmul(A1[j], rhs[j]);
108
res[j+1] += cj0.pmul(A0[j+1],t0);
109
t2 += cj1.pmul(A0[j+1], rhs[j+1]);
112
for (size_t i=starti; i<alignedStart; ++i)
114
res[i] += t0 * A0[i] + t1 * A1[i];
115
t2 += conj(A0[i]) * rhs[i];
116
t3 += conj(A1[i]) * rhs[i];
118
// Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
119
// gcc 4.2 does this optimization automatically.
120
const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
121
const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
122
const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
123
Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
124
for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
126
Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize;
127
Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize;
128
Packet Bi = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
129
Packet Xi = pload <Packet>(resIt);
131
Xi = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
132
ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
133
ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
134
pstore(resIt,Xi); resIt += PacketSize;
136
for (size_t i=alignedEnd; i<endi; i++)
138
res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
139
t2 += cj1.pmul(A0[i], rhs[i]);
140
t3 += cj1.pmul(A1[i], rhs[i]);
143
res[j] += alpha * (t2 + predux(ptmp2));
144
res[j+1] += alpha * (t3 + predux(ptmp3));
146
for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
148
register const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
150
Scalar t1 = cjAlpha * rhs[j];
152
// TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
153
res[j] += cjd.pmul(internal::real(A0[j]), t1);
154
for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
156
res[i] += cj0.pmul(A0[i], t1);
157
t2 += cj1.pmul(A0[i], rhs[i]);
159
res[j] += alpha * t2;
163
} // end namespace internal
165
/***************************************************************************
166
* Wrapper to product_selfadjoint_vector
167
***************************************************************************/
170
template<typename Lhs, int LhsMode, typename Rhs>
171
struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
172
: traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
176
template<typename Lhs, int LhsMode, typename Rhs>
177
struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
178
: public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
180
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
183
LhsUpLo = LhsMode&(Upper|Lower)
186
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
188
template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const
190
typedef typename Dest::Scalar ResScalar;
191
typedef typename Base::RhsScalar RhsScalar;
192
typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
194
eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
196
const ActualLhsType lhs = LhsBlasTraits::extract(m_lhs);
197
const ActualRhsType rhs = RhsBlasTraits::extract(m_rhs);
199
Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
200
* RhsBlasTraits::extractScalarFactor(m_rhs);
203
EvalToDest = (Dest::InnerStrideAtCompileTime==1),
204
UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
207
internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
208
internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
210
ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
211
EvalToDest ? dest.data() : static_dest.data());
213
ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
214
UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
218
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
219
int size = dest.size();
220
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
222
MappedDest(actualDestPtr, dest.size()) = dest;
227
#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
228
int size = rhs.size();
229
EIGEN_DENSE_STORAGE_CTOR_PLUGIN
231
Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
235
internal::product_selfadjoint_vector<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>
238
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
239
actualRhsPtr, 1, // rhs info
240
actualDestPtr, // result info
241
actualAlpha // scale factor
245
dest = MappedDest(actualDestPtr, dest.size());
250
template<typename Lhs, typename Rhs, int RhsMode>
251
struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
252
: traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
256
template<typename Lhs, typename Rhs, int RhsMode>
257
struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
258
: public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
260
EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
263
RhsUpLo = RhsMode&(Upper|Lower)
266
SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
268
template<typename Dest> void scaleAndAddTo(Dest& dest, Scalar alpha) const
270
// let's simply transpose the product
271
Transpose<Dest> destT(dest);
272
SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
273
Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
278
#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H