1
/ synth_1to1_3dnow works the same way as the C version of
2
/ synth_1to1. This assembler code is based on 'decode-i586.s'
3
/ (by Stefan Bieschewski <stb@acm.org>), with two types of changes:
5
/ - use {MMX,3DNow!} instructions to reduce CPU load
6
/ - remove unused(?) local symbols
8
/ useful sources of information on optimizing 3DNow! code include:
9
/ AMD 3DNow! Technology Manual (Publication #21928)
10
/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
11
/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
12
/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
13
/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
15
/ This code was tested only on AMD-K6-2 processor Linux systems, so it is not yet known:
17
/ - whether this code works on other 3DNow! capable processors
18
/ (ex.IDT-C6-2) or not
19
/ - whether this code works on other OSes or not
21
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
22
/ <kim@comtec.co.jp> - after 1.Apr.1998
24
/ Enhancements for q-word operations by Michael Hipp
2
/ decode_3dnow.s - 3DNow! optimized synth_1to1()
4
/ This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama
5
/ <squash@mb.kcom.ne.jp>; only the following changes have been made:
7
/ - remove the PREFETCH instruction for speed
8
/ - change the function name to support automatic 3DNow! detection
9
/ - femms moved to before 'call dct64_3dnow'
11
/ You can find Kashiyama's original 3dnow! support patch
12
/ (for mpg123-0.59o) at
13
/ http://user.ecc.u-tokyo.ac.jp/~g810370/linux-simd/ (Japanese).
15
/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1999
16
/ <kim@comtec.co.jp> - after 1.Apr.1999
20
/// Replacement of synth_1to1() with AMD's 3DNow! SIMD operations support
22
/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
24
/// The author of this program disclaim whole expressed or implied
25
/// warranties with regard to this program, and in no event shall the
26
/// author of this program liable to whatever resulted from the use of
27
/// this program. Use it at your own risk.
31
/// buffs.40: common-symbol storage of 4352 bytes; the third operand (32)
/// is the requested alignment for the symbol.
/// The code in this file loads the address buffs.40+2176, which is the exact
/// midpoint of the block (2 * 2176 = 4352).
/// NOTE(review): presumably the block is used as two equal 2176-byte halves
/// - confirm against the full routine.
.comm buffs.40,4352,32
33
39
.globl synth_1to1_3dnow
40
.type synth_1to1_3dnow,@function
61
call do_equalizer_3dnow
72
movl $buffs.40+2176,%ecx
61
leal (%ebx,%edx,4),%eax
66
90
leal 1088(,%eax,4),%eax
100
leal 1092(%ecx,%edx,4),%eax
70
102
leal 1088(%ecx),%ebx
74
leal 1092(%ecx,%ebp,4),%eax
76
leal (%ecx,%ebp,4),%eax
105
leal (%ecx,%edx,4),%eax
135
sar $16,%eax / new clip
144
/ --- end of loop 1 ---
188
sar $16,%eax / new clip
167
punpckldq 8(%ebx),%mm0
168
punpckldq 8(%edx),%mm1
172
punpckldq 24(%ebx),%mm3
173
punpckldq 24(%edx),%mm4
177
punpckldq 40(%ebx),%mm5
178
punpckldq 40(%edx),%mm6
183
punpckldq 56(%ebx),%mm1
184
punpckldq 56(%edx),%mm2
245
sar $16,%eax / new clip
198
leal -128(%edx,%esi,8),%edx
203
punpckldq 4(%ebx),%mm0
204
punpckldq -8(%edx),%mm1
210
punpckldq 12(%ebx),%mm3
211
punpckldq -16(%edx),%mm4
215
punpckldq 20(%ebx),%mm5
216
punpckldq -24(%edx),%mm6
221
punpckldq 28(%ebx),%mm1
222
punpckldq -32(%edx),%mm2
227
punpckldq 36(%ebx),%mm3
228
punpckldq -40(%edx),%mm4
233
punpckldq 44(%ebx),%mm5
234
punpckldq -48(%edx),%mm6
239
punpckldq 52(%ebx),%mm1
240
punpckldq -56(%edx),%mm2
245
punpckldq 60(%ebx),%mm3
246
punpckldq (%edx),%mm4
253
punpckldq 4(%ebx),%mm0
254
punpckldq -8(%edx),%mm1