1
/******************************************************************************
2
* Copyright © 2012-2014 Institut für Nachrichtentechnik, Universität Rostock *
3
* Copyright © 2006-2012 Quality & Usability Lab, *
4
* Telekom Innovation Laboratories, TU Berlin *
6
* This file is part of the Audio Processing Framework (APF). *
8
* The APF is free software: you can redistribute it and/or modify it under *
9
* the terms of the GNU General Public License as published by the Free *
10
* Software Foundation, either version 3 of the License, or (at your option) *
11
* any later version. *
13
* The APF is distributed in the hope that it will be useful, but WITHOUT ANY *
14
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS *
15
* FOR A PARTICULAR PURPOSE. *
16
* See the GNU General Public License for more details. *
18
* You should have received a copy of the GNU General Public License along *
19
* with this program. If not, see <http://www.gnu.org/licenses/>. *
21
* http://AudioProcessingFramework.github.com *
22
******************************************************************************/
25
/// Convolution engine.
27
#ifndef APF_CONVOLVER_H
28
#define APF_CONVOLVER_H
30
#include <algorithm> // for std::transform()
31
#include <functional> // for std::bind()
35
#include <xmmintrin.h> // for SSE instrinsics
39
#include "apf/fftwtools.h" // for fftw_allocator and fftw traits
40
#include "apf/container.h" // for fixed_vector, fixed_list
41
#include "apf/iterator.h" // for make_*_iterator()
46
/** Convolution engine.
47
* A convolution engine normally consists of an Input, a Filter and
48
* an Output / StaticOutput.
49
* There are also combinations:
50
* Input + Output = Convolver; Input + StaticOutput = StaticConvolver
52
* Uses (uniformly) partitioned convolution.
54
* TODO: describe thread (un)safety
59
/// Calculate necessary number of partitions for a given filter length
60
static size_t min_partitions(size_t block_size, size_t filter_size)
62
return (filter_size + block_size - 1) / block_size;
65
/// Two blocks of time-domain or FFT (half-complex) data.
66
struct fft_node : fixed_vector<float, fftw_allocator<float>>
68
explicit fft_node(size_t n)
69
: fixed_vector<float, fftw_allocator<float>>(n)
73
fft_node(const fft_node&) = delete;
74
fft_node(fft_node&&) = default;
76
fft_node& operator=(const fft_node& rhs)
78
assert(this->size() == rhs.size());
86
std::copy(rhs.begin(), rhs.end(), this->begin());
92
// WARNING: The 'zero' flag allows saving computation power, but it also
93
// raises the risk of programming errors! Handle with care!
95
/// To avoid unnecessary FFTs and filling buffers with zeros.
96
/// @note If zero == true, the buffer itself is not necessarily all zeros!
100
/// Container holding a number of FFT blocks.
101
struct Filter : fixed_vector<fft_node>
103
/// Constructor; create empty filter.
104
Filter(size_t block_size_, size_t partitions_)
105
: fixed_vector<fft_node>(partitions_, block_size_ * 2)
107
assert(this->partitions() > 0);
110
/// Constructor from time domain coefficients.
111
template<typename In>
112
Filter(size_t block_size_, In first, In last, size_t partitions_ = 0);
113
// Implementation below, after definition of Transform
115
size_t block_size() const { return this->front().size() / 2; }
116
size_t partition_size() const { return this->front().size(); }
117
size_t partitions() const { return this->size(); }
120
/// Forward-FFT-related functions
124
template<typename In>
125
void prepare_filter(In first, In last, Filter& filter) const;
127
size_t block_size() const { return _block_size; }
128
size_t partition_size() const { return _partition_size; }
130
template<typename In>
131
In prepare_partition(In first, In last, fft_node& partition) const;
134
explicit TransformBase(size_t block_size_);
136
TransformBase(TransformBase&&) = default;
137
~TransformBase() = default;
139
using scoped_plan = fftw<float>::scoped_plan;
140
using plan_ptr = std::unique_ptr<scoped_plan>;
142
plan_ptr _create_plan(float* array) const;
145
void _fft(float* first) const
147
fftw<float>::execute_r2r(*_fft_plan, first, first);
148
_sort_coefficients(first);
154
void _sort_coefficients(float* first) const;
156
const size_t _block_size;
157
const size_t _partition_size;
160
TransformBase::TransformBase(size_t block_size_)
161
: _block_size(block_size_)
162
, _partition_size(2 * _block_size)
164
if (_block_size % 8 != 0)
166
throw std::logic_error("Convolver: block size must be a multiple of 8!");
170
/** Create in-place FFT plan for halfcomplex data format.
171
* @note FFT plans are not re-entrant except when using FFTW_THREADSAFE!
172
* @note Once a plan of a certain size exists, creating further plans
173
* is very fast because "wisdom" is shared (and therefore the creation of
174
* plans is not thread-safe).
175
* It is not necessary to re-use plans in other convolver instances.
177
TransformBase::plan_ptr
178
TransformBase::_create_plan(float* array) const
180
return plan_ptr(new scoped_plan(fftw<float>::plan_r2r_1d, int(_partition_size)
181
, array, array, FFTW_R2HC, FFTW_PATIENT));
184
/** %Transform time-domain samples.
185
* If there are too few input samples, the rest is zero-padded, if there are
186
* too few blocks in the container @p c, the rest of the samples is ignored.
187
* @param first Iterator to first time-domain sample
188
* @param last Past-the-end iterator
189
* @param[out] filter Target container
191
template<typename In>
193
TransformBase::prepare_filter(In first, In last, Filter& filter) const
195
for (auto& partition: filter)
197
first = this->prepare_partition(first, last, partition);
201
/** FFT of one block.
202
* If there are too few coefficients, the rest is zero-padded.
203
* @param first Iterator to first coefficient
204
* @param last Past-the-end iterator
205
* @param[out] partition Target partition
206
* @tparam In Forward iterator
207
* @return Iterator to the first coefficient of the next block (for the next
208
* iteration, if needed)
210
template<typename In>
212
TransformBase::prepare_partition(In first, In last, fft_node& partition) const
214
assert(size_t(std::distance(partition.begin(), partition.end()))
217
auto chunk = std::min(_block_size, size_t(std::distance(first, last)));
221
partition.zero = true;
222
// No FFT has to be done (FFT of zero is also zero)
226
std::copy(first, first + chunk, partition.begin());
227
std::fill(partition.begin() + chunk, partition.end(), 0.0f); // zero padding
228
_fft(partition.data());
229
partition.zero = false;
231
return first + chunk;
234
/** Sort the FFT coefficients to be in proper place for the efficient
235
* multiplication of the spectra.
238
TransformBase::_sort_coefficients(float* data) const
240
auto buffer = fixed_vector<float>(_partition_size);
248
buffer[4] = data[_block_size];
249
buffer[5] = data[_partition_size - 1];
250
buffer[6] = data[_partition_size - 2];
251
buffer[7] = data[_partition_size - 3];
253
for (size_t i = 0; i < (_partition_size / 8-1); i++)
255
for (size_t ii = 0; ii < 4; ii++)
257
buffer[base+ii] = data[base/2+ii];
260
for (size_t ii = 0; ii < 4; ii++)
262
buffer[base+4+ii] = data[_partition_size-base/2-ii];
268
std::copy(buffer.begin(), buffer.end(), data);
271
/// Helper class to prepare filters
272
struct Transform : TransformBase
274
Transform(size_t block_size_)
275
: TransformBase(block_size_)
277
// Temporary memory area for FFTW planning routines
278
fft_node planning_space(this->partition_size());
279
_fft_plan = _create_plan(planning_space.data());
283
template<typename In>
284
Filter::Filter(size_t block_size_, In first, In last, size_t partitions_)
285
: fixed_vector<fft_node>(partitions_ ? partitions_
286
: min_partitions(block_size_, std::distance(first, last))
289
assert(this->partitions() > 0);
291
Transform(block_size_).prepare_filter(first, last, *this);
294
/** %Input stage of convolution.
295
* New audio data is fed in here, further processing happens in Output.
297
struct Input : TransformBase
299
/// @param block_size_ audio block size
300
/// @param partitions_ number of partitions
301
Input(size_t block_size_, size_t partitions_)
302
: TransformBase(block_size_)
303
// One additional list element for preparing the upcoming partition:
304
, spectra(partitions_ + 1, this->partition_size())
306
assert(partitions_ > 0);
308
_fft_plan = _create_plan(spectra.front().data());
311
template<typename In>
312
void add_block(In first);
314
size_t partitions() const { return spectra.size() - 1; }
316
/// Spectra of the partitions (double-blocks) of the input signal to be
317
/// convolved. The first element is the most recent signal chunk.
318
fixed_list<fft_node> spectra;
321
/** Add a block of time-domain input samples.
322
* @param first Iterator to first sample.
323
* @tparam In Forward iterator
325
template<typename In>
327
Input::add_block(In first)
330
std::advance(last, this->block_size());
332
// rotate buffers (this->spectra.size() is always at least 2)
333
this->spectra.move(--this->spectra.end(), this->spectra.begin());
335
auto& current = this->spectra.front();
336
auto& next = this->spectra.back();
338
if (math::has_only_zeros(first, last))
344
// Nothing to be done, actual data is ignored
348
// If first half is not zero, second half must be filled with zeros
349
std::fill(current.begin() + this->block_size(), current.end(), 0.0f);
356
// First half must be actually filled with zeros
357
std::fill(current.begin(), current.begin() + this->block_size(), 0.0f);
360
// Copy data to second half of the current partition
361
std::copy(first, last, current.begin() + this->block_size());
362
current.zero = false;
363
// Copy data to first half of the upcoming partition
364
std::copy(first, last, next.begin());
370
// Nothing to be done, FFT of zero is also zero
374
_fft(current.data());
378
/// Base class for Output and StaticOutput
382
float* convolve(float weight = 1.0f);
384
size_t block_size() const { return _input.block_size(); }
385
size_t partitions() const { return _filter_ptrs.size(); }
388
explicit OutputBase(const Input& input);
390
// This is non-const to allow automatic move-constructor:
391
fft_node _empty_partition;
393
using filter_ptrs_t = fixed_vector<const fft_node*>;
394
filter_ptrs_t _filter_ptrs;
397
void _multiply_spectra();
398
void _multiply_partition_cpp(const float* signal, const float* filter);
400
void _multiply_partition_simd(const float* signal, const float* filter);
403
void _unsort_coefficients();
409
const size_t _partition_size;
411
fft_node _output_buffer;
412
fftw<float>::scoped_plan _ifft_plan;
415
OutputBase::OutputBase(const Input& input)
416
: _empty_partition(0)
417
// Initialize with empty partition
418
, _filter_ptrs(input.partitions(), &_empty_partition)
420
, _partition_size(input.partition_size())
421
, _output_buffer(_partition_size)
422
, _ifft_plan(fftw<float>::plan_r2r_1d, int(_partition_size)
423
, _output_buffer.data()
424
, _output_buffer.data(), FFTW_HC2R, FFTW_PATIENT)
426
assert(_filter_ptrs.size() > 0);
429
/** Fast convolution of one audio block.
430
* %Input data has to be supplied with Input::add_block().
431
* @param weight amplitude weighting factor for current audio block.
432
* The filter has to be set in the constructor of StaticOutput or via
433
* Output::set_filter().
434
* @return pointer to the first sample of the convolved (and weighted) signal
437
OutputBase::convolve(float weight)
441
// The first half will be discarded
442
auto second_half = make_begin_and_end(
443
_output_buffer.begin() + _input.block_size(), _output_buffer.end());
445
assert(static_cast<size_t>(
446
std::distance(second_half.begin(), second_half.end()))
447
== _input.block_size());
449
if (_output_buffer.zero)
451
// Nothing to be done, IFFT of zero is also zero.
452
// _output_buffer was already reset to zero in _multiply_spectra().
458
// normalize buffer (fftw3 does not do this)
459
const auto norm = weight / float(_partition_size);
460
for (auto& x: second_half)
465
return &second_half[0];
469
OutputBase::_multiply_partition_cpp(const float* signal, const float* filter)
471
// see http://www.ludd.luth.se/~torger/brutefir.html#bruteconv_4
473
auto d1s = _output_buffer[0] + signal[0] * filter[0];
474
auto d2s = _output_buffer[4] + signal[4] * filter[4];
476
for (size_t nn = 0; nn < _partition_size; nn += 8)
479
_output_buffer[nn+0] += signal[nn+0] * filter[nn + 0] -
480
signal[nn+4] * filter[nn + 4];
481
_output_buffer[nn+1] += signal[nn+1] * filter[nn + 1] -
482
signal[nn+5] * filter[nn + 5];
483
_output_buffer[nn+2] += signal[nn+2] * filter[nn + 2] -
484
signal[nn+6] * filter[nn + 6];
485
_output_buffer[nn+3] += signal[nn+3] * filter[nn + 3] -
486
signal[nn+7] * filter[nn + 7];
489
_output_buffer[nn+4] += signal[nn+0] * filter[nn + 4] +
490
signal[nn+4] * filter[nn + 0];
491
_output_buffer[nn+5] += signal[nn+1] * filter[nn + 5] +
492
signal[nn+5] * filter[nn + 1];
493
_output_buffer[nn+6] += signal[nn+2] * filter[nn + 6] +
494
signal[nn+6] * filter[nn + 2];
495
_output_buffer[nn+7] += signal[nn+3] * filter[nn + 7] +
496
signal[nn+7] * filter[nn + 3];
500
_output_buffer[0] = d1s;
501
_output_buffer[4] = d2s;
506
OutputBase::_multiply_partition_simd(const float* signal, const float* filter)
508
// 16 byte alignment is needed for _mm_load_ps()!
509
// This should be the case anyway because fftwf_malloc() is used.
511
auto dc = _output_buffer[0] + signal[0] * filter[0];
512
auto ny = _output_buffer[4] + signal[4] * filter[4];
514
for(size_t i = 0; i < _partition_size; i += 8)
516
// load real and imaginary parts of signal and filter
517
__m128 sigr = _mm_load_ps(signal + i);
518
__m128 sigi = _mm_load_ps(signal + i + 4);
519
__m128 filtr = _mm_load_ps(filter + i);
520
__m128 filti = _mm_load_ps(filter + i + 4);
522
// multiply and subtract
523
__m128 res1 = _mm_sub_ps(_mm_mul_ps(sigr, filtr), _mm_mul_ps(sigi, filti));
526
__m128 res2 = _mm_add_ps(_mm_mul_ps(sigr, filti), _mm_mul_ps(sigi, filtr));
528
// load output data for accumulation
529
__m128 acc1 = _mm_load_ps(&_output_buffer[i]);
530
__m128 acc2 = _mm_load_ps(&_output_buffer[i + 4]);
533
acc1 = _mm_add_ps(acc1, res1);
534
acc2 = _mm_add_ps(acc2, res2);
537
_mm_store_ps(&_output_buffer[i], acc1);
538
_mm_store_ps(&_output_buffer[i + 4], acc2);
541
_output_buffer[0] = dc;
542
_output_buffer[4] = ny;
546
/// Complex multiplication of input and filter spectra
548
OutputBase::_multiply_spectra()
551
std::fill(_output_buffer.begin(), _output_buffer.end(), 0.0f);
552
_output_buffer.zero = true;
554
assert(_filter_ptrs.size() == _input.partitions());
556
auto input = _input.spectra.begin();
558
for (const auto filter: _filter_ptrs)
560
assert(filter != nullptr);
562
if (input->zero || filter->zero) continue;
565
_multiply_partition_simd(input->data(), filter->data());
567
_multiply_partition_cpp(input->data(), filter->data());
570
_output_buffer.zero = false;
576
OutputBase::_unsort_coefficients()
578
fixed_vector<float> buffer(_partition_size);
582
buffer[0] = _output_buffer[0];
583
buffer[1] = _output_buffer[1];
584
buffer[2] = _output_buffer[2];
585
buffer[3] = _output_buffer[3];
586
buffer[_input.block_size()] = _output_buffer[4];
587
buffer[_partition_size-1] = _output_buffer[5];
588
buffer[_partition_size-2] = _output_buffer[6];
589
buffer[_partition_size-3] = _output_buffer[7];
591
for (size_t i=0; i < (_partition_size / 8-1); i++)
593
for (size_t ii = 0; ii < 4; ii++)
595
buffer[base/2+ii] = _output_buffer[base+ii];
598
for (size_t ii = 0; ii < 4; ii++)
600
buffer[_partition_size-base/2-ii] = _output_buffer[base+4+ii];
606
std::copy(buffer.begin(), buffer.end(), _output_buffer.begin());
612
_unsort_coefficients();
613
fftw<float>::execute(_ifft_plan);
616
/** Convolution engine (output part).
617
* @see Input, StaticOutput
619
class Output : public OutputBase
622
Output(const Input& input)
624
, _queues(apf::make_index_iterator(size_t(1))
625
, apf::make_index_iterator(input.partitions()))
628
void set_filter(const Filter& filter);
630
bool queues_empty() const;
631
void rotate_queues();
634
fixed_vector<filter_ptrs_t> _queues;
637
/** Set a new filter.
638
* The first filter partition is updated immediately, the later partitions are
639
* updated with rotate_queues().
640
* @param filter Container with filter partitions. If too few partitions are
641
* given, the rest is set to zero, if too many are given, the rest is ignored.
644
Output::set_filter(const Filter& filter)
646
auto partition = filter.begin();
648
// First partition has no queue and is updated immediately
649
if (partition != filter.end())
651
_filter_ptrs.front() = &*partition++;
654
for (size_t i = 0; i < _queues.size(); ++i)
657
= (partition == filter.end()) ? &_empty_partition : &*partition++;
661
/** Check if there are still valid partitions in the queues.
662
* If this function returns @b false, rotate_queues() should be called.
663
* @note This is important for crossfades: even if set_filter() wasn't used,
664
* older partitions may still change! If the queues are empty, no crossfade is
665
* necessary (except @p weight was changed in convolve()).
668
Output::queues_empty() const
670
if (_queues.empty()) return true;
672
// It may not be obvious, but that's what the following code does:
673
// If some non-null pointer is found in the last queue, return false
675
auto first = _queues.rbegin()->begin();
676
auto last = _queues.rbegin()->end();
677
return std::find_if(first, last, math::identity<const fft_node*>()) == last;
680
/** Update filter queues.
681
* If queues_empty() returns @b true, calling this function is unnecessary.
682
* @note This can lead to artifacts, so a crossfade is recommended.
685
Output::rotate_queues()
687
auto target = _filter_ptrs.begin();
688
// Skip first element, it doesn't have a queue
691
for (auto& queue: _queues)
693
// If first element is valid, use it
694
if (queue.front()) *target = queue.front();
696
std::copy(queue.begin() + 1, queue.end(), queue.begin());
697
*queue.rbegin() = nullptr;
702
/** %Convolver output stage with static filter.
703
* The filter coefficients are set in the constructor(s) and cannot be changed.
706
class StaticOutput : public OutputBase
709
/// Constructor from time domain samples
710
template<typename In>
711
StaticOutput(const Input& input, In first, In last)
714
_filter.reset(new Filter(input.block_size(), first, last
715
, input.partitions()));
717
_set_filter(*_filter);
720
/// Constructor from existing frequency domain filter coefficients.
721
/// @attention The filter coefficients are not copied, their lifetime must
722
/// exceed that of the StaticOutput!
723
StaticOutput(const Input& input, const Filter& filter)
730
void _set_filter(const Filter& filter)
732
auto from = filter.begin();
734
for (auto& to: _filter_ptrs)
736
// If less partitions are given, the rest is set to zero
737
to = (from == filter.end()) ? &_empty_partition : &*from++;
739
// If further partitions are available, they are ignored
742
// This is only used for the first constructor!
743
std::unique_ptr<Filter> _filter;
746
/// Combination of Input and Output
747
struct Convolver : Input, Output
749
Convolver(size_t block_size_, size_t partitions_)
750
: Input(block_size_, partitions_)
751
// static_cast to resolve ambiguity
752
, Output(*static_cast<Input*>(this))
756
/// Combination of Input and StaticOutput
757
struct StaticConvolver : Input, StaticOutput
759
template<typename In>
760
StaticConvolver(size_t block_size_, In first, In last, size_t partitions_ = 0)
761
: Input(block_size_, partitions_ ? partitions_
762
: min_partitions(block_size_, std::distance(first, last)))
763
, StaticOutput(*this, first, last)
766
StaticConvolver(const Filter& filter, size_t partitions_ = 0)
767
: Input(filter.block_size()
768
, partitions_ ? partitions_ : filter.partitions())
769
, StaticOutput(*this, filter)
773
/// Apply @c std::transform to a container of fft_node%s
774
template<typename BinaryFunction>
775
void transform_nested(const Filter& in1, const Filter& in2, Filter& out
778
auto it1 = in1.begin();
779
auto it2 = in2.begin();
781
for (auto& result: out)
783
if (it1 == in1.end() || it1->zero)
785
if (it2 == in2.end() || it2->zero)
791
assert(it2->size() == result.size());
792
std::transform(it2->begin(), it2->end(), result.begin()
793
, std::bind(f, 0, std::placeholders::_1));
799
if (it2 == in2.end() || it2->zero)
801
assert(it1->size() == result.size());
802
std::transform(it1->begin(), it1->end(), result.begin()
803
, std::bind(f, std::placeholders::_1, 0));
808
assert(it1->size() == it2->size());
809
assert(it1->size() == result.size());
810
std::transform(it1->begin(), it1->end(), it2->begin(), result.begin()
815
if (it1 != in1.end()) ++it1;
816
if (it2 != in2.end()) ++it2;
826
// Settings for Vim (http://www.vim.org/), please do not remove:
827
// vim:softtabstop=2:shiftwidth=2:expandtab:textwidth=80:cindent
828
// vim:fdm=expr:foldexpr=getline(v\:lnum)=~'/\\*\\*'&&getline(v\:lnum)!~'\\*\\*/'?'a1'\:getline(v\:lnum)=~'\\*\\*/'&&getline(v\:lnum)!~'/\\*\\*'?'s1'\:'='