1
# PiTiVi , Non-linear video editor
3
# timeline/alignalgs.py
5
# Copyright (c) 2011, Benjamin M. Schwartz <bens@alum.mit.edu>
7
# This program is free software; you can redistribute it and/or
8
# modify it under the terms of the GNU Lesser General Public
9
# License as published by the Free Software Foundation; either
10
# version 2.1 of the License, or (at your option) any later version.
12
# This program is distributed in the hope that it will be useful,
13
# but WITHOUT ANY WARRANTY; without even the implied warranty of
14
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
# Lesser General Public License for more details.
17
# You should have received a copy of the GNU Lesser General Public
18
# License along with this program; if not, write to the
19
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
20
# Boston, MA 02110-1301, USA.
23
Algorithms for aligning (i.e. registering, synchronizing) time series
39
def submax(left, middle, right):
41
Find the maximum of a quadratic function from three samples.
43
Given samples from a quadratic P(x) at x=-1, 0, and 1, find the x
44
that extremizes P. This is useful for determining the subsample
45
position of the extremum given three samples around the observed
48
@param left: value at x=-1
50
@param middle: value at x=0
51
@type middle: L{float}
52
@param right: value at x=1
54
@returns: value of x that extremizes the interpolating quadratic
58
L = middle - left # L and R are both positive if middle is the
59
R = middle - right # observed max of the integer samples
60
return 0.5 * (R - L) / (R + L)
61
# Derivation: Consider a quadratic q(x) := P(0) - P(x). Then q(x) has
62
# two roots, one at 0 and one at z, and the extreme is at (0+z)/2
64
# q(x) = bx*(x-z) # a may be positive or negative
65
# q(1) = b*(1 - z) = R
66
# q(-1) = b*(1 + z) = L
67
# (1+z)/(1-z) = L/R (from here it's just algebra to find a)
68
# z + 1 = R/L - (R/L)*z
70
# z = (R/L - 1)/(R/L + 1) = (R-L)/(R+L)
73
def rigidalign(reference, targets):
75
Estimate the relative shift between reference and targets.
77
The algorithm works by subtracting the mean, and then locating
78
the maximum of the cross-correlation. For inputs of length M{N},
79
the running time is M{O(C{len(targets)}*N*log(N))}.
81
@param reference: the waveform to regard as fixed
82
@type reference: Sequence(Number)
83
@param targets: the waveforms that should be aligned to reference
84
@type targets: Sequence(Sequence(Number))
85
@returns: The shift necessary to bring each target into alignment
86
with the reference. The returned shift may not be an integer,
87
indicating that the best alignment would be achieved by a
88
non-integer shift and appropriate interpolation.
89
@rtype: Sequence(Number)
92
# L is the maximum size of a cross-correlation between the
93
# reference and any of the targets.
94
L = len(reference) + max(len(t) for t in targets) - 1
95
# We round up L to the next power of 2 for speed in the FFT.
97
reference = reference - numpy.mean(reference)
98
fref = numpy.fft.rfft(reference, L).conj()
101
t = t - numpy.mean(t)
102
# Compute cross-correlation
103
xcorr = numpy.fft.irfft(fref * numpy.fft.rfft(t, L))
104
# shift maximizes dotproduct(t[shift:],reference)
105
# int() to convert numpy.int32 to python int
106
shift = int(numpy.argmax(xcorr))
107
subsample_shift = submax(xcorr[(shift - 1) % L],
109
xcorr[(shift + 1) % L])
110
shift = shift + subsample_shift
111
# shift is now a float indicating the interpolated maximum
112
if shift >= len(t): # Negative shifts appear large and positive
113
shift -= L # This corrects them to be negative
114
shifts.append(-shift)
115
# Sign reversed to move the target instead of the reference
120
# Helper function for affinealign
121
# The provided matrix a contains a bright line whose slope we want to know,
122
# against a noisy background.
123
# The line starts at 0,0. If the slope is positive, it runs toward the
124
# center of the matrix (i.e. toward (-1,-1))
125
# If the slope is negative, it wraps from 0,0 to 0,-1 and continues toward
126
# the center, (i.e. toward (-1,0)).
127
# The line segment terminates at the midline along the X direction.
128
# We locate the line by simply checking the sum along each possible line
129
# up to the Y-max edge of a. The caller sets the limit by choosing the
131
# The function returns a floating-point slope assuming that the matrix
132
# has "square pixels".
135
x_pos = numpy.arange(1, X)
136
x_neg = numpy.arange(2 * X - 1, X, -1)
139
for end in xrange(Y):
140
y = (x_pos * end) // X
141
s = numpy.sum(a[y, x_pos])
145
s = numpy.sum(a[y, x_neg])
149
return float(best_end) / X
152
def affinealign(reference, targets, max_drift=0.02):
153
""" EXPERIMENTAL FUNCTION.
155
Perform an affine registration between a reference and a number of
156
targets. Designed for aligning the amplitude envelopes of recordings of
157
the same event by different devices.
159
NOTE: This method is currently NOT USED by PiTiVi, as it has proven both
160
unnecessary and unusable. So far every test case has been registered
161
successfully by rigidalign, and until PiTiVi supports time-stretching of
162
audio, the drift calculation cannot actually be used.
164
@param reference: the reference signal to which others will be registered
165
@type reference: array(number)
166
@param targets: the signals to register
167
@type targets: ordered iterable(array(number))
168
@param max_drift: the maximum absolute clock drift rate
169
(i.e. stretch factor) that will be considered during search
170
@type max_drift: positive L{float}
171
@return: (offsets, drifts). offsets[i] is the point in reference at which
172
targets[i] starts. drifts[i] is the speed of targets[i] relative to
173
the reference (positive is faster, meaning the target should be
174
slowed down to be in sync with the reference)
176
L = len(reference) + max(len(t) for t in targets) - 1
178
bsize = int(20. / max_drift) # NEEDS TUNING
179
num_blocks = nextpow2(1.0 * len(reference) // bsize) # NEEDS TUNING
180
bspace = (len(reference) - bsize) // num_blocks
181
reference -= numpy.mean(reference)
183
# Construct FFT'd reference blocks
184
freference_blocks = numpy.zeros((L2 / 2 + 1, num_blocks),
186
for i in xrange(num_blocks):
188
tmp = numpy.zeros((L2,))
189
tmp[s:s + bsize] = reference[s:s + bsize]
190
freference_blocks[:, i] = numpy.fft.rfft(tmp, L2).conj()
191
freference_blocks[:10, :] = 0 # High-pass to ignore slow volume variations
197
ft = numpy.fft.rfft(t, L2)
198
#fxcorr is the FFT'd cross-correlation with the reference blocks
199
fxcorr_blocks = numpy.zeros((L2 / 2 + 1, num_blocks),
201
for i in xrange(num_blocks):
202
fxcorr_blocks[:, i] = ft * freference_blocks[:, i]
203
fxcorr_blocks[:, i] /= numpy.sqrt(numpy.sum(
204
fxcorr_blocks[:, i] ** 2))
206
# At this point xcorr_blocks would show a distinct bright line, nearly
207
# orthogonal to time, indicating where each of these blocks found their
208
# peak. Each point on this line represents the time in t where block i
209
# found its match. The time-intercept gives the time in b at which the
210
# reference starts, and the slope gives the amount by which the
211
# reference is faster relative to b.
213
# The challenge now is to find this line. Our strategy is to reduce the
214
# search to one dimension by first finding the slope.
215
# The Fourier Transform of a smooth real line in 2D is an orthogonal
216
# line through the origin, with phase that gives its position.
217
# Unfortunately this line is not clearly visible in fxcorr_blocks, so
218
# we discard the phase (by taking the absolute value) and then inverse
219
# transform. This places the line at the origin, so we can find its
222
# Construct the half-autocorrelation matrix
223
# (A true autocorrelation matrix would be ifft(abs(fft(x))**2), but this
224
# is just ifft(abs(fft(x))).)
225
# Construction is stepwise partly in an attempt to save memory
226
# The width is 2*num_blocks in order to avoid overlapping positive and
227
# negative correlations
228
halfautocorr = numpy.fft.fft(fxcorr_blocks, 2 * num_blocks, 1)
229
halfautocorr = numpy.abs(halfautocorr)
230
halfautocorr = numpy.fft.ifft(halfautocorr, None, 1)
231
halfautocorr = numpy.fft.irfft(halfautocorr, None, 0)
232
# Now it's actually the half-autocorrelation.
233
# Chop out the bit we don't care about
234
halfautocorr = halfautocorr[:bspace * num_blocks * max_drift, :]
235
# Remove the local-correlation peak.
236
halfautocorr[-1:2, -1:2] = 0 # NEEDS TUNING
237
# Normalize each column (appears to be necessary)
238
for i in xrange(2 * num_blocks):
239
halfautocorr[:, i] /= numpy.sqrt(numpy.sum(
240
halfautocorr[:, i] ** 2))
241
#from matplotlib.pyplot import imshow,show
242
#imshow(halfautocorr,interpolation='nearest',aspect='auto');show()
243
drift = _findslope(halfautocorr) / bspace
246
#inverse transform and shift everything into alignment
247
xcorr_blocks = numpy.fft.irfft(fxcorr_blocks, None, 0)
249
#TODO: see if phase ramps are worthwhile here
250
for i in xrange(num_blocks):
251
blockcenter = i * bspace + bsize / 2
252
shift = int(blockcenter * drift)
254
temp = xcorr_blocks[:shift, i].copy()
255
xcorr_blocks[:-shift, i] = xcorr_blocks[shift:, i].copy()
256
xcorr_blocks[-shift:, i] = temp
258
temp = xcorr_blocks[shift:, i].copy()
259
xcorr_blocks[-shift:, i] = xcorr_blocks[:shift, i].copy()
260
xcorr_blocks[:-shift, i] = temp
262
#from matplotlib.pyplot import imshow,show
263
#imshow(xcorr_blocks,interpolation='nearest',aspect='auto');show()
265
# xcorr is the drift-compensated cross-correlation
266
xcorr = numpy.sum(xcorr_blocks, axis=1)
269
offset = numpy.argmax(xcorr)
270
#from matplotlib.pyplot import plot,show
276
# now offset is the point in target at which reference starts and
277
# drift is the speed with which the reference drifts relative to the
278
# target. We reverse these relationships for the caller.
280
offsets.append(-offset / slope)
281
drifts.append(1 / slope - 1)
282
return offsets, drifts
284
if __name__ == '__main__':
285
# Simple command-line test
288
envelopes = [numpy.fromfile(n) for n in names]
289
reference = envelopes[-1]
290
offsets, drifts = affinealign(reference, envelopes, 0.02)
291
print offsets, drifts
292
from matplotlib.pyplot import *
294
for i in xrange(len(envelopes)):
295
t = offsets[i] + (1 + drifts[i]) * numpy.arange(len(envelopes[i]))
296
plot(t, envelopes[i] / numpy.sqrt(numpy.sum(envelopes[i] ** 2)))