3
# pitivi/autoaligner.py
5
# Copyright (c) 2011, Benjamin M. Schwartz <bens@alum.mit.edu>
7
# This program is free software; you can redistribute it and/or
8
# modify it under the terms of the GNU Lesser General Public
9
# License as published by the Free Software Foundation; either
10
# version 2.1 of the License, or (at your option) any later version.
12
# This program is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
# Lesser General Public License for more details.
17
# You should have received a copy of the GNU Lesser General Public
18
# License along with this program; if not, write to the
19
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
20
# Boston, MA 02110-1301, USA.
22
# TODO reimplement after GES port
24
Classes for automatic alignment of L{Clip}s
27
from gi.repository import GObject
28
from gi.repository import Gst
31
from gi.repository import Gtk
40
from gettext import gettext as _
42
import pitivi.configure as configure
44
from pitivi.utils.ui import beautify_ETA
45
from pitivi.utils.misc import call_false
46
from pitivi.utils.extract import Extractee
47
from pitivi.utils.loggable import Loggable
57
def submax(left, middle, right):
59
Find the maximum of a quadratic function from three samples.
61
Given samples from a quadratic P(x) at x=-1, 0, and 1, find the x
62
that extremizes P. This is useful for determining the subsample
63
position of the extremum given three samples around the observed
66
@param left: value at x=-1
68
@param middle: value at x=0
69
@type middle: L{float}
70
@param right: value at x=1
72
@returns: value of x that extremizes the interpolating quadratic
76
L = middle - left # L and R are both positive if middle is the
77
R = middle - right # observed max of the integer samples
78
return 0.5 * (R - L) / (R + L)
79
# Derivation: Consider a quadratic q(x) := P(0) - P(x). Then q(x) has
80
# two roots, one at 0 and one at z, and the extreme is at (0+z)/2
82
# q(x) = bx*(x-z) # a may be positive or negative
83
# q(1) = b*(1 - z) = R
84
# q(-1) = b*(1 + z) = L
85
# (1+z)/(1-z) = L/R (from here it's just algebra to find a)
86
# z + 1 = R/L - (R/L)*z
88
# z = (R/L - 1)/(R/L + 1) = (R-L)/(R+L)
91
def rigidalign(reference, targets):
93
Estimate the relative shift between reference and targets.
95
The algorithm works by subtracting the mean, and then locating
96
the maximum of the cross-correlation. For inputs of length M{N},
97
the running time is M{O(C{len(targets)}*N*log(N))}.
99
@param reference: the waveform to regard as fixed
100
@type reference: Sequence(Number)
101
@param targets: the waveforms that should be aligned to reference
102
@type targets: Sequence(Sequence(Number))
103
@returns: The shift necessary to bring each target into alignment
104
with the reference. The returned shift may not be an integer,
105
indicating that the best alignment would be achieved by a
106
non-integer shift and appropriate interpolation.
107
@rtype: Sequence(Number)
110
# L is the maximum size of a cross-correlation between the
111
# reference and any of the targets.
112
L = len(reference) + max(len(t) for t in targets) - 1
113
# We round up L to the next power of 2 for speed in the FFT.
115
reference = reference - numpy.mean(reference)
116
fref = numpy.fft.rfft(reference, L).conj()
119
t = t - numpy.mean(t)
120
# Compute cross-correlation
121
xcorr = numpy.fft.irfft(fref * numpy.fft.rfft(t, L))
122
# shift maximizes dotproduct(t[shift:],reference)
123
# int() to convert numpy.int32 to python int
124
shift = int(numpy.argmax(xcorr))
125
subsample_shift = submax(xcorr[(shift - 1) % L],
127
xcorr[(shift + 1) % L])
128
shift = shift + subsample_shift
129
# shift is now a float indicating the interpolated maximum
130
if shift >= len(t): # Negative shifts appear large and positive
131
shift -= L # This corrects them to be negative
132
shifts.append(-shift)
133
# Sign reversed to move the target instead of the reference
138
# Helper function for affinealign
139
# The provided matrix a contains a bright line whose slope we want to know,
140
# against a noisy background.
141
# The line starts at 0,0. If the slope is positive, it runs toward the
142
# center of the matrix (i.e. toward (-1,-1))
143
# If the slope is negative, it wraps from 0,0 to 0,-1 and continues toward
144
# the center, (i.e. toward (-1,0)).
145
# The line segment terminates at the midline along the X direction.
146
# We locate the line by simply checking the sum along each possible line
147
# up to the Y-max edge of a. The caller sets the limit by choosing the
149
# The function returns a floating-point slope assuming that the matrix
150
# has "square pixels".
153
x_pos = numpy.arange(1, X)
154
x_neg = numpy.arange(2 * X - 1, X, -1)
157
for end in xrange(Y):
158
y = (x_pos * end) // X
159
s = numpy.sum(a[y, x_pos])
163
s = numpy.sum(a[y, x_neg])
167
return float(best_end) / X
170
def affinealign(reference, targets, max_drift=0.02):
171
""" EXPERIMENTAL FUNCTION.
173
Perform an affine registration between a reference and a number of
174
targets. Designed for aligning the amplitude envelopes of recordings of
175
the same event by different devices.
177
NOTE: This method is currently NOT USED by Pitivi, as it has proven both
178
unnecessary and unusable. So far every test case has been registered
179
successfully by rigidalign, and until Pitivi supports time-stretching of
180
audio, the drift calculation cannot actually be used.
182
@param reference: the reference signal to which others will be registered
183
@type reference: array(number)
184
@param targets: the signals to register
185
@type targets: ordered iterable(array(number))
186
@param max_drift: the maximum absolute clock drift rate
187
(i.e. stretch factor) that will be considered during search
188
@type max_drift: positive L{float}
189
@return: (offsets, drifts). offsets[i] is the point in reference at which
190
targets[i] starts. drifts[i] is the speed of targets[i] relative to
191
the reference (positive is faster, meaning the target should be
192
slowed down to be in sync with the reference)
194
L = len(reference) + max(len(t) for t in targets) - 1
196
bsize = int(20. / max_drift) # NEEDS TUNING
197
num_blocks = nextpow2(1.0 * len(reference) // bsize) # NEEDS TUNING
198
bspace = (len(reference) - bsize) // num_blocks
199
reference -= numpy.mean(reference)
201
# Construct FFT'd reference blocks
202
freference_blocks = numpy.zeros((L2 / 2 + 1, num_blocks),
204
for i in xrange(num_blocks):
206
tmp = numpy.zeros((L2,))
207
tmp[s:s + bsize] = reference[s:s + bsize]
208
freference_blocks[:, i] = numpy.fft.rfft(tmp, L2).conj()
209
freference_blocks[:10, :] = 0 # High-pass to ignore slow volume variations
215
ft = numpy.fft.rfft(t, L2)
216
#fxcorr is the FFT'd cross-correlation with the reference blocks
217
fxcorr_blocks = numpy.zeros((L2 / 2 + 1, num_blocks),
219
for i in xrange(num_blocks):
220
fxcorr_blocks[:, i] = ft * freference_blocks[:, i]
221
fxcorr_blocks[:, i] /= numpy.sqrt(numpy.sum(fxcorr_blocks[:, i] ** 2))
223
# At this point xcorr_blocks would show a distinct bright line, nearly
224
# orthogonal to time, indicating where each of these blocks found their
225
# peak. Each point on this line represents the time in t where block i
226
# found its match. The time-intercept gives the time in b at which the
227
# reference starts, and the slope gives the amount by which the
228
# reference is faster relative to b.
230
# The challenge now is to find this line. Our strategy is to reduce the
231
# search to one dimension by first finding the slope.
232
# The Fourier Transform of a smooth real line in 2D is an orthogonal
233
# line through the origin, with phase that gives its position.
234
# Unfortunately this line is not clearly visible in fxcorr_blocks, so
235
# we discard the phase (by taking the absolute value) and then inverse
236
# transform. This places the line at the origin, so we can find its
239
# Construct the half-autocorrelation matrix
240
# (A true autocorrelation matrix would be ifft(abs(fft(x))**2), but this
241
# is just ifft(abs(fft(x))).)
242
# Construction is stepwise partly in an attempt to save memory
243
# The width is 2*num_blocks in order to avoid overlapping positive and
244
# negative correlations
245
halfautocorr = numpy.fft.fft(fxcorr_blocks, 2 * num_blocks, 1)
246
halfautocorr = numpy.abs(halfautocorr)
247
halfautocorr = numpy.fft.ifft(halfautocorr, None, 1)
248
halfautocorr = numpy.fft.irfft(halfautocorr, None, 0)
249
# Now it's actually the half-autocorrelation.
250
# Chop out the bit we don't care about
251
halfautocorr = halfautocorr[:bspace * num_blocks * max_drift, :]
252
# Remove the local-correlation peak.
253
halfautocorr[-1:2, -1:2] = 0 # NEEDS TUNING
254
# Normalize each column (appears to be necessary)
255
for i in xrange(2 * num_blocks):
256
halfautocorr[:, i] /= numpy.sqrt(numpy.sum(halfautocorr[:, i] ** 2))
257
#from matplotlib.pyplot import imshow,show
258
#imshow(halfautocorr,interpolation='nearest',aspect='auto');show()
259
drift = _findslope(halfautocorr) / bspace
262
#inverse transform and shift everything into alignment
263
xcorr_blocks = numpy.fft.irfft(fxcorr_blocks, None, 0)
265
#TODO: see if phase ramps are worthwhile here
266
for i in xrange(num_blocks):
267
blockcenter = i * bspace + bsize / 2
268
shift = int(blockcenter * drift)
270
temp = xcorr_blocks[:shift, i].copy()
271
xcorr_blocks[:-shift, i] = xcorr_blocks[shift:, i].copy()
272
xcorr_blocks[-shift:, i] = temp
274
temp = xcorr_blocks[shift:, i].copy()
275
xcorr_blocks[-shift:, i] = xcorr_blocks[:shift, i].copy()
276
xcorr_blocks[:-shift, i] = temp
278
#from matplotlib.pyplot import imshow,show
279
#imshow(xcorr_blocks,interpolation='nearest',aspect='auto');show()
281
# xcorr is the drift-compensated cross-correlation
282
xcorr = numpy.sum(xcorr_blocks, axis=1)
285
offset = numpy.argmax(xcorr)
286
#from matplotlib.pyplot import plot,show
292
# now offset is the point in target at which reference starts and
293
# drift is the speed with which the reference drifts relative to the
294
# target. We reverse these relationships for the caller.
296
offsets.append(-offset / slope)
297
drifts.append(1 / slope - 1)
298
return offsets, drifts
301
def getAudioTrack(clip):
303
Helper function for getting an audio track from a Clip
305
@param clip: The Clip from which to locate an audio track
307
@returns: An audio track from clip, or None if clip has no audio track
308
@rtype: audio L{TrackElement} or L{NoneType}
310
for track in clip.track_elements:
311
if track.stream_type == AudioStream:
318
"""Abstract interface representing a progress meter."""
320
def addWatcher(self, function):
321
""" Add a progress watching callback function. This callback will
322
always be called from the main thread.
324
@param function: a function to call with progress updates.
325
@type function: callable(fractional_progress, time_remaining_text).
326
fractional_progress is a float normalized to [0,1].
327
time_remaining_text is a localized text string indicating the
328
estimated time remaining.
330
raise NotImplementedError
333
class ProgressAggregator(ProgressMeter):
335
"""A ProgressMeter that aggregates progress reports.
337
Reports from multiple sources are combined into a unified progress
343
# _targets is a list giving the size of each task.
345
# _portions is a list of the same length as _targets, indicating
346
# the portion of each task that as been completed (initially 0).
348
self._start = time.time()
351
def getPortionCB(self, target):
352
"""Prepare a new input for the Aggregator.
355
(in arbitrary units, but should be consistent across all calls on
356
a single ProgressAggregator object), it returns a callback that
357
can be used to update progress on this portion of the task.
359
@param target: the total task size for this portion
361
@returns: a callback that can be used to inform the Aggregator of
362
subsequent updates to this portion
363
@rtype: function(x), where x should be a number indicating the
364
absolute amount of this subtask that has been completed.
367
i = len(self._targets)
368
self._targets.append(target)
369
self._portions.append(0)
372
self._portions[i] = thusfar
373
GLib.idle_add(self._callForward)
376
def addWatcher(self, function):
377
self._watchers.append(function)
379
def _callForward(self):
380
# This function always returns False so that it may be safely
381
# invoked via GLib.idle_add(). Use of idle_add() is necessary
382
# to ensure that watchers are always called from the main thread,
383
# even if progress updates are received from other threads.
384
total_target = sum(self._targets)
385
total_completed = sum(self._portions)
386
if total_target == 0:
388
frac = min(1.0, float(total_completed) / total_target)
390
remaining = (now - self._start) * (1 - frac) / frac
391
for function in self._watchers:
392
function(frac, beautify_ETA(int(remaining * Gst.SECOND)))
396
class EnvelopeExtractee(Extractee, Loggable):
398
"""Class that computes the envelope of a 1-D signal (audio).
400
The envelope is defined as the sum of the absolute value of the signal
401
over each block. This class computes the envelope incrementally,
402
so that the entire signal does not ever need to be stored.
406
def __init__(self, blocksize, callback, *cbargs):
408
@param blocksize: the number of samples in a block
409
@type blocksize: L{int}
410
@param callback: a function to call when the extraction is complete.
411
The function's first argument will be a numpy array
412
representing the envelope, and any later argument to this
413
function will be passed as subsequent arguments to callback.
416
Loggable.__init__(self)
417
self._blocksize = blocksize
419
self._cbargs = cbargs
420
self._blocks = numpy.zeros((0,), dtype=numpy.float32)
421
self._empty = array.array('f', [])
422
# self._samples buffers up to self._threshold samples, before
423
# their envelope is computed and store in self._blocks, in order
424
# to amortize some of the function call overheads.
425
self._samples = array.array('f', [])
426
self._threshold = 2000 * blocksize
427
self._progress_watchers = []
429
def receive(self, a):
430
self._samples.extend(a)
431
if len(self._samples) < self._threshold:
434
self._process_samples()
436
def addWatcher(self, w):
438
Add a function to call with progress updates.
440
@param w: callback function
441
@type w: function(# of samples received so far)
444
self._progress_watchers.append(w)
446
def _process_samples(self):
447
excess = len(self._samples) % self._blocksize
449
samples_to_process = self._samples[:-excess]
450
self._samples = self._samples[-excess:]
452
samples_to_process = self._samples
453
self._samples = array.array('f', [])
454
self.debug("Adding %s samples to %s blocks",
455
len(samples_to_process), len(self._blocks))
456
newblocks = len(samples_to_process) // self._blocksize
457
samples_abs = numpy.abs(
458
samples_to_process).reshape((newblocks, self._blocksize))
459
self._blocks.resize((len(self._blocks) + newblocks,))
460
# This numpy.sum() call relies on samples_abs being a
461
# floating-point type. If samples_abs.dtype is int16
462
# then the sum may overflow.
463
self._blocks[-newblocks:] = numpy.sum(samples_abs, 1)
464
for w in self._progress_watchers:
465
w(self._blocksize * len(self._blocks) + excess)
468
self._process_samples() # absorb any remaining buffered samples
469
self._cb(self._blocks, *self._cbargs)
472
class AutoAligner(Loggable):
475
Class for aligning a set of L{Clip}s automatically.
477
The alignment is based on their contents, so that the shifted tracks
478
are synchronized. The current implementation only analyzes audio
479
data, so timeline objects without an audio track cannot be aligned.
485
@ivar BLOCKRATE: The number of amplitude blocks per second.
487
The AutoAligner works by computing the "amplitude envelope" of each
488
audio stream. We define an amplitude envelope as the absolute value
489
of the audio samples, downsampled to a low samplerate. This
490
samplerate, in Hz, is given by BLOCKRATE. (It is given this name
491
because the downsampling filter is implemented by very simple
492
averaging over blocks, i.e. a box filter.) 25 Hz appears to be a
493
good choice because it evenly divides all common audio samplerates
494
(e.g. 11025 and 8000). Lower blockrate requires less CPU time but
495
produces less accurate alignment. Higher blockrate is the reverse
496
(and also cannot evenly divide all samplerates).
500
def __init__(self, clips, callback):
502
@param clips: an iterable of L{Clip}s.
503
In this implementation, only L{Clip}s with at least one
504
audio track will be aligned.
505
@type clips: iter(L{Clip})
506
@param callback: A function to call when alignment is complete. No
507
arguments will be provided.
508
@type callback: function
511
Loggable.__init__(self)
512
# self._clips maps each object to its envelope. The values
513
# are initially None prior to envelope extraction.
514
self._clips = dict.fromkeys(clips)
515
self._callback = callback
516
# stack of (Track, Extractee) pairs waiting to be processed
517
# When start() is called, the stack will be populated, and then
518
# processed sequentially. Only one item from the stack will be
519
# actively in process at a time.
520
self._extraction_stack = []
525
Can an AutoAligner align these objects?
527
Determine whether a group of timeline objects can all
528
be aligned together by an AutoAligner.
530
@param clips: a group of timeline objects
531
@type clips: iterable(L{Clip})
532
@returns: True iff the objects can aligned.
536
# numpy is a "soft dependency". If you're running without numpy,
537
# this False return value is your only warning not to
538
# use the AutoAligner, which will crash immediately.
539
return all(getAudioTrack(t) is not None for t in clips)
541
def _extractNextEnvelope(self):
542
audiotrack, extractee = self._extraction_stack.pop()
543
r = RandomAccessAudioExtractor(audiotrack.factory,
545
r.extract(extractee, audiotrack.in_point,
546
audiotrack.out_point - audiotrack.in_point)
549
def _envelopeCb(self, array, clip):
550
self.debug("Receiving envelope for %s", clip)
551
self._clips[clip] = array
552
if self._extraction_stack:
553
self._extractNextEnvelope()
554
else: # This was the last envelope
555
self._performShifts()
560
Initiate the auto-alignment process.
562
@returns: a L{ProgressMeter} indicating the progress of the
564
@rtype: L{ProgressMeter}
567
progress_aggregator = ProgressAggregator()
568
pairs = [] # (Clip, {audio}TrackElement) pairs
569
for clip in self._clips.keys():
570
audiotrack = getAudioTrack(clip)
571
if audiotrack is not None:
572
pairs.append((clip, audiotrack))
573
else: # forget any Clip without an audio track
574
self._clips.pop(clip)
576
for clip, audiotrack in pairs:
577
# blocksize is the number of samples per block
578
blocksize = audiotrack.stream.rate // self.BLOCKRATE
579
extractee = EnvelopeExtractee(blocksize, self._envelopeCb, clip)
580
# numsamples is the total number of samples in the track,
581
# which is used by progress_aggregator to determine
582
# the percent completion.
583
numsamples = ((audiotrack.duration / Gst.SECOND) *
584
audiotrack.stream.rate)
585
extractee.addWatcher(progress_aggregator.getPortionCB(numsamples))
586
self._extraction_stack.append((audiotrack, extractee))
587
# After we return, start the extraction cycle.
588
# This GLib.idle_add call should not be necessary;
589
# we should be able to invoke _extractNextEnvelope directly
590
# here. However, there is some as-yet-unexplained
591
# race condition between the Python GIL, GTK UI updates,
592
# GLib mainloop, and pygst multithreading, resulting in
593
# occasional deadlocks during autoalignment.
594
# This call to idle_add() reportedly eliminates the deadlock.
596
GLib.idle_add(self._extractNextEnvelope)
597
else: # We can't do anything without at least two audio tracks
598
# After we return, call the callback function (once)
599
GLib.idle_add(call_false, self._callback)
600
return progress_aggregator
602
def _chooseReference(self):
604
Chooses the timeline object to use as a reference.
606
This function currently selects the one with lowest priority,
607
i.e. appears highest in the GUI. The behavior of this function
608
affects user interaction, because the user may want to
609
determine which object moves and which stays put.
611
@returns: the timeline object with lowest priority.
617
return min(self._clips.iterkeys(), key=priority)
619
def _performShifts(self):
620
self.debug("performing shifts")
621
reference = self._chooseReference()
622
# By using pop(), this line also removes the reference
623
# Clip and its envelope from further consideration,
624
# saving some CPU time in rigidalign.
625
reference_envelope = self._clips.pop(reference)
626
# We call list() because we need a reliable ordering of the pairs
627
# (In python 3, dict.items() returns an unordered dictview)
628
pairs = list(self._clips.items())
629
envelopes = [p[1] for p in pairs]
630
offsets = rigidalign(reference_envelope, envelopes)
631
for (movable, envelope), offset in zip(pairs, offsets):
632
# tshift is the offset rescaled to units of nanoseconds
633
tshift = int((offset * Gst.SECOND) / self.BLOCKRATE)
634
self.debug("Shifting %s to %i ns from %i",
635
movable, tshift, reference.start)
636
newstart = reference.start + tshift
638
movable.start = newstart
640
# Timeline objects always must have a positive start point, so
641
# if alignment would move an object to start at negative time,
642
# we instead make it start at zero and chop off the required
643
# amount at the beginning.
645
movable.in_point = movable.in_point - newstart
646
movable.duration += newstart
649
class AlignmentProgressDialog:
650
""" Dialog indicating the progress of the auto-alignment process.
651
Code derived from L{RenderingProgressDialog}, but greatly simplified
652
(read-only, no buttons)."""
654
def __init__(self, app):
655
self.builder = Gtk.Builder()
656
self.builder.add_from_file(os.path.join(configure.get_ui_dir(), "alignmentprogress.ui"))
657
self.builder.connect_signals(self)
659
self.window = self.builder.get_object("align-progress")
660
self.progressbar = self.builder.get_object("progressbar")
661
# Parent this dialog with mainwindow
662
# set_transient_for allows this dialog to properly
663
# minimize together with the mainwindow. This method is
664
# taken from RenderingProgressDialog. In both cases, it appears
665
# to work correctly, although there is a known bug for Gnome 3 in
666
# RenderingProgressDialog (bug #652917)
667
self.window.set_transient_for(app.gui)
669
# FIXME: Add a cancel button
671
def updatePosition(self, fraction, estimated):
672
self.progressbar.set_fraction(fraction)
673
self.window.set_title(_("%d%% Analyzed") % int(100 * fraction))
675
self.progressbar.set_text(_("About %s left") % estimated)
678
if __name__ == '__main__':
679
# Simple command-line test
682
envelopes = [numpy.fromfile(n) for n in names]
683
reference = envelopes[-1]
684
offsets, drifts = affinealign(reference, envelopes, 0.02)
685
print offsets, drifts
686
from matplotlib.pyplot import *
688
for i in xrange(len(envelopes)):
689
t = offsets[i] + (1 + drifts[i]) * numpy.arange(len(envelopes[i]))
690
plot(t, envelopes[i] / numpy.sqrt(numpy.sum(envelopes[i] ** 2)))