2
Copyright (C) 2005-2006 Paul Davis, John Rigg
4
This program is free software; you can redistribute it and/or modify
5
it under the terms of the GNU General Public License as published by
6
the Free Software Foundation; either version 2 of the License, or
7
(at your option) any later version.
9
This program is distributed in the hope that it will be useful,
10
but WITHOUT ANY WARRANTY; without even the implied warranty of
11
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
GNU General Public License for more details.
14
You should have received a copy of the GNU General Public License
15
along with this program; if not, write to the Free Software
16
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
Author: Sampo Savolainen
19
64-bit conversion: John Rigg
25
#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
#;
#; Mixes src into dst with a gain factor. The visible SSE loop body computes
#; dst[0..3] += src[0..3] * gain, four floats per pass.
#; NOTE(review): this listing appears to be an excerpt. Bare numeric lines are
#; interleaved with the code, and some instructions plus the definitions of
#; .MBWG_NONALIGN / .MBWG_NONALIGNLOOP are not visible, so the comments below
#; describe only what each visible instruction does.
#; NOTE(review): %rbx is callee-saved in the System V AMD64 ABI; its
#; save/restore is not visible in this excerpt - confirm.
27
.globl x86_sse_mix_buffers_with_gain
28
.type x86_sse_mix_buffers_with_gain,@function
30
x86_sse_mix_buffers_with_gain:
34
#; %rdx unsigned int nframes
45
#; if nframes == 0, go to end
49
#; Check for alignment
52
andq $12, %rax #; keep bits 2-3 only: offset (0, 4, 8 or 12) within a 16-byte block
55
andq $12, %rbx #; same mask applied to the offset held in %rbx
58
jne .MBWG_NONALIGN #; alignment check failed: process the frames one at a time
64
#; Pre-loop: run 1-3 frames "manually", one at a time, until 16-byte alignment is reached
69
#; gain is already in %xmm0
75
addq $4, %rdi #; dst++ (one float = 4 bytes)
76
addq $4, %rsi #; src++ (one float = 4 bytes)
77
decq %rdx #; nframes--
82
cmp $16, %rbx #; test if we've reached 16 byte alignment
88
cmp $4, %rdx #; we know it's not zero, but if fewer than 4 frames remain
89
jnge .MBWG_NONALIGN #; (signed less-than) we skip the 4-wide loop and use the one-frame code
91
#; gain is already in %xmm0
92
shufps $0x00, %xmm0, %xmm0 #; broadcast lane 0 (gain) into all four lanes
97
movaps (%rsi), %xmm1 #; 4 source floats => xmm1 (movaps needs a 16-byte aligned address)
98
mulps %xmm0, %xmm1 #; apply gain to source
99
addps (%rdi), %xmm1 #; mix with destination
100
movaps %xmm1, (%rdi) #; store the 4 mixed floats back to destination
102
addq $16, %rdi #; dst+=4 (floats)
103
addq $16, %rsi #; src+=4 (floats)
105
subq $4, %rdx #; nframes-=4
112
#; if there are remaining frames, the nonalign code will do nicely
113
#; for the remaining 1-3 frames.
118
#; gain is already in %xmm0
131
jnz .MBWG_NONALIGNLOOP #; repeat while the preceding (not visible here) instruction left ZF clear
143
.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
146
#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
#;
#; Adds src into dst. The visible SSE loop body computes dst[0..3] += src[0..3]
#; four floats per pass; the visible scalar code computes dst[0] += src[0].
#; NOTE(review): this listing appears to be an excerpt. Bare numeric lines are
#; interleaved with the code, and some instructions plus the definition of
#; .MBNG_NONALIGN are not visible, so the comments below describe only what
#; each visible instruction does.
148
.globl x86_sse_mix_buffers_no_gain
149
.type x86_sse_mix_buffers_no_gain,@function
151
x86_sse_mix_buffers_no_gain:
155
#; %rdx unsigned int nframes
160
#; save the registers
167
#; if nframes == 0, go to end
171
#; Check for alignment
174
andq $12, %rax #; keep bits 2-3 only: offset (0, 4, 8 or 12) within a 16-byte block
177
andq $12, %rbx #; same mask applied to the offset held in %rbx
180
jne .MBNG_NONALIGN #; alignment check failed: process the frames one at a time
185
#; Pre-loop: run 1-3 frames "manually", one at a time, until 16-byte alignment is reached
194
addq $4, %rdi #; dst++ (one float = 4 bytes)
195
addq $4, %rsi #; src++ (one float = 4 bytes)
196
decq %rdx #; nframes--
200
cmp $16, %rbx #; test if we've reached 16 byte alignment
205
cmp $4, %rdx #; if there are frames left, but fewer than 4
206
jnge .MBNG_NONALIGN #; (signed less-than) we can't run the 4-wide SSE loop
210
movaps (%rsi), %xmm0 #; 4 source floats => xmm0 (movaps needs a 16-byte aligned address)
211
addps (%rdi), %xmm0 #; mix with destination
212
movaps %xmm0, (%rdi) #; store the 4 mixed floats back to destination
214
addq $16, %rdi #; dst+=4 (floats)
215
addq $16, %rsi #; src+=4 (floats)
217
subq $4, %rdx #; nframes-=4
224
#; if there are remaining frames, the nonalign code will do nicely
225
#; for the remaining 1-3 frames.
230
movss (%rsi), %xmm0 #; one src float => xmm0
231
addss (%rdi), %xmm0 #; xmm0 += *dst
232
movss %xmm0, (%rdi) #; xmm0 => *dst
250
.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
253
#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
#;
#; Presumably scales each of the nframes floats in buf by gain; the multiply
#; instructions themselves are not visible in this excerpt - confirm.
#; NOTE(review): this listing appears to be an excerpt. Bare numeric lines are
#; interleaved with the code, and some instructions plus the definitions of
#; .AG_SSE, .AG_END, .AGLP_START and .AGPOST_START are not visible, so the
#; comments below describe only what each visible instruction does.
255
.globl x86_sse_apply_gain_to_buffer
256
.type x86_sse_apply_gain_to_buffer,@function
258
x86_sse_apply_gain_to_buffer:
260
#; %rdi float *buf
261
#; %rsi unsigned int nframes
263
#; %xmm1 float buf[0]
273
#; if nframes == 0, go to end
274
movq %rsi, %rcx #; working copy of nframes in %rcx
278
#; set up the gain buffer (gain is already in %xmm0)
279
shufps $0x00, %xmm0, %xmm0 #; broadcast lane 0 (gain) into all four lanes
281
#; Check for alignment
283
movq %rdi, %rdx #; buf => %rdx
284
andq $12, %rdx #; keep bits 2 & 3 (mask 12): result = 0, 4, 8 or 12
285
jz .AG_SSE #; if buffer IS aligned
288
#; we iterate 1-3 times, handling one frame per pass,
289
#; so we reach a 16 byte aligned "buf" (=%rdi) value
293
#; Load next value from the buffer into %xmm1
298
#; increment buffer, decrement counter
299
addq $4, %rdi #; buf++;
301
decq %rcx #; nframes--
302
jz .AG_END #; if we run out of frames, we go to the end
304
addq $4, %rdx #; alignment offset advances by one float (4 bytes)
306
jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
310
#; We have reached the 16 byte aligned "buf" ("rdi") value
312
#; Figure out how many loops we should do
313
movq %rcx, %rax #; copy remaining nframes to %rax for division
314
movq $0, %rdx #; clear %rdx: divq divides the 128-bit value %rdx:%rax
319
#; NOTE(review): %rdi holds the buf pointer everywhere else in view; the code
#; that loads the divisor into %rdi (and restores buf afterwards) is not
#; visible in this excerpt - confirm.
divq %rdi #; unsigned divide: quotient => %rax, remainder => %rdx
322
#; %rax = SSE iterations
334
subq $4, %rcx #; nframes-=4
339
#; Next we need to post-process all remaining frames
340
#; the remaining frame count is in %rcx
342
#; if no remaining frames, jump to the end
344
andq $3, %rcx #; nframes % 4
353
#; increment buffer, decrement counter
354
addq $4, %rdi #; buf++;
356
decq %rcx #; nframes--
357
jnz .AGPOST_START #; loop while frames remain; fall through once we run out
368
.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
372
#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
#;
#; Multiplies buf element-wise by gain_vector in place. The visible scalar code
#; computes buf[0] *= gain_vector[0] for one frame at a time.
#; NOTE(review): this listing appears to be an excerpt. Bare numeric lines are
#; interleaved with the code, and some instructions plus the definition of
#; .AGA_SSE are not visible, so the comments below describe only what each
#; visible instruction does.
374
.globl x86_sse_apply_gain_vector
375
.type x86_sse_apply_gain_vector,@function
377
x86_sse_apply_gain_vector:
380
#; %rsi float *gain_vector
381
#; %rdx unsigned int nframes
391
#; if nframes == 0 go to end
406
jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
408
#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
411
movss (%rdi), %xmm0 #; one float from buf => xmm0
412
movss (%rsi), %xmm1 #; matching gain value => xmm1
413
mulss %xmm1, %xmm0 #; xmm0 = xmm0 * xmm1
414
movss %xmm0, (%rdi) #; signal with gain => buf
419
addq $4, %rdi #; buf++
420
addq $4, %rsi #; gain_vector++
426
#; There are frames left for sure, as that is checked in the beginning
427
#; and within the previous loop. BUT, there might be less than 4 frames
431
movq %rdx, %rax #; nframes => %rax
432
shr $2, %rax #; unsigned divide by 4
434
cmp $0, %rax #; (translated from Finnish) if it works without this compare, great
449
andq $3, %rdx #; Remaining frames are nframes & 3
453
#; Inside this loop, we know there are frames left to process
454
#; but because either there are < 4 frames left, or the buffers
455
#; are not aligned, we can't use the parallel SSE ops
457
movss (%rdi), %xmm0 #; one float from buf => xmm0
458
movss (%rsi), %xmm1 #; matching gain value => xmm1
459
mulss %xmm1, %xmm0 #; xmm0 = xmm0 * xmm1
460
movss %xmm0, (%rdi) #; signal with gain => buf
464
decq %rdx #; nframes--
476
.size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
480
#; float x86_sse_compute_peak(float *buf, long nframes, float current);
#;
#; Presumably returns the larger of "current" and the largest absolute sample
#; in buf; only part of the max reduction is visible in this excerpt - confirm.
#; NOTE(review): this listing appears to be an excerpt. Bare numeric lines are
#; interleaved with the code, and some instructions plus the definitions of
#; .CP_SSE, .CP_END and .LP_START are not visible, so the comments below
#; describe only what each visible instruction does.
482
.globl x86_sse_compute_peak
483
.type x86_sse_compute_peak,@function
486
x86_sse_compute_peak:
488
#; %rdi float *buf
489
#; %rsi nframes (NOTE(review): listed here as unsigned int, but the prototype above says long - confirm)
490
#; %xmm0 float current
491
#; %xmm1 float buf[0]
499
#; if nframes == 0, go to end
500
movq %rsi, %rcx #; working copy of nframes in %rcx
504
#; create the "abs" mask in %xmm2
508
shufps $0x00, %xmm2, %xmm2 #; broadcast lane 0 of %xmm2 into all four lanes
510
#; Check for alignment
512
#;movq 8(%rbp), %rdi #; buf
513
movq %rdi, %rdx #; buf => %rdx
514
andq $12, %rdx #; keep bits 2 & 3 (mask 12): result = 0, 4, 8 or 12
515
jz .CP_SSE #; if buffer IS aligned
518
#; we iterate 1-3 times, comparing one float per pass,
519
#; so we reach a 16 byte aligned "buf" (=%rdi) value
523
#; Load next value from the buffer
528
#; increment buffer, decrement counter
529
addq $4, %rdi #; buf++;
531
decq %rcx #; nframes--
532
jz .CP_END #; if we run out of frames, we go to the end
534
addq $4, %rdx #; alignment offset advances by one float (4 bytes)
536
jne .LP_START #; if more non-aligned frames exist, we do a do-over
540
#; We have reached the 16 byte aligned "buf" ("rdi") value
542
#; Figure out how many loops we should do
543
movq %rcx, %rax #; copy remaining nframes to %rax for division
545
shr $2,%rax #; unsigned divide by 4
548
#; %rax = SSE iterations
550
#; current maximum is at %xmm0, but we need to ..
551
shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
553
#;prefetcht0 16(%rdi)
566
#; Calculate the maximum value contained in the 4 FP's in %xmm0
568
shufps $0x4e, %xmm1, %xmm1 #; swap the two 64-bit halves (1234 => 3412)
569
maxps %xmm1, %xmm0 #; lane-wise maximum of %xmm0 and the swapped copy
571
shufps $0xb1, %xmm1, %xmm1 #; swap the floats inside each pair (1234 => 2143)
574
#; now every float in %xmm0 is the same value, current maximum value
576
#; Next we need to post-process all remaining frames
577
#; the remaining frame count is in %rcx
579
#; if no remaining frames, jump to the end
581
andq $3, %rcx #; nframes % 4
590
addq $4, %rdi #; buf++;
592
decq %rcx #; nframes--;
603
.size x86_sse_compute_peak, .-x86_sse_compute_peak
607
#; Empty .note.GNU-stack section: by GNU toolchain convention this marks the object as not needing an executable stack
.section .note.GNU-stack,"",%progbits