3
/ Copyright 1999 by Michael Hipp
5
/ Not really optimized. Just using 3dnow instead of
6
/ standard FPU instructions enhances performance a lot.
2
/// Replacement of dct64() with AMD's 3DNow! SIMD operations support
4
/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
6
/// The author of this program disclaims all expressed or implied
7
/// warranties with regard to this program, and in no event shall the
8
/// author of this program be liable for whatever results from the use of
9
/// this program. Use it at your own risk.
13
.type dct64_3dnow,@function
19
subl $256,%esp / tmp-buff
401
movd %mm3,28(%edx) / 7
403
pfsub %mm1,%mm0 / 5 = 4 - 5
404
pfadd %mm3,%mm6 / 6 += 7
407
pfadd %mm6,%mm7 / 4 += 6
410
pfadd %mm0,%mm6 / 6 += 5
414
pfadd %mm3,%mm0 / 5 += 7
423
movl 8(%ebp),%ebx / out0
424
movl 12(%ebp),%edi / out1
426
subl %ebx,%edi / allows ebx 16 bit relative addressing
427
/ maybe only "as" needs this
430
movl %eax,0x40*16(%ebx)
432
movl %eax,0x40*12(%ebx)
434
movl %eax,0x40*8(%ebx)
436
movl %eax,0x40*4(%ebx)
439
movl %eax,(%ebx,%edi)
441
movl %eax,0x40*4(%ebx,%edi)
443
movl %eax,0x40*8(%ebx,%edi)
445
movl %eax,0x40*12(%ebx,%edi)
447
movd 32(%edx),%mm0 / 8
448
movd 48(%edx),%mm1 / C
450
movd %mm0,4*0xe0(%ebx)
453
movd %mm1,4*0xa0(%ebx)
456
movd %mm0,4*0x60(%ebx)
460
movd %mm1,4*0x20(%ebx)
463
movd %mm0,4*0x20(%ebx,%edi)
466
movd %mm1,4*0x60(%ebx,%edi)
470
movd %mm0,4*0xa0(%ebx,%edi)
471
movd %mm1,4*0xe0(%ebx,%edi)
475
movq 4*0x10(%edx),%mm2
476
movq 4*0x18(%edx),%mm0
478
movq 4*0x1c(%edx),%mm1
481
movd %mm2,4*0x10*15(%ebx)
483
movd %mm2,4*0x10*1(%ebx,%edi)
484
movq 4*0x14(%edx),%mm2
486
movd %mm0,4*0x10*13(%ebx)
33
punpckldq 120(%edi),%mm2
45
punpckldq 112(%edi),%mm6
57
punpckldq 104(%edi),%mm2
69
punpckldq 96(%edi),%mm6
81
punpckldq 88(%edi),%mm2
93
punpckldq 80(%edi),%mm6
105
punpckldq 72(%edi),%mm2
117
punpckldq 64(%edi),%mm6
133
punpckldq 56(%ebx),%mm2
146
punpckldq 120(%ebx),%mm2
158
punpckldq 48(%ebx),%mm6
171
punpckldq 112(%ebx),%mm6
183
punpckldq 40(%ebx),%mm2
196
punpckldq 104(%ebx),%mm2
208
punpckldq 32(%ebx),%mm6
221
punpckldq 96(%ebx),%mm6
238
punpckldq 24(%esi),%mm4
250
punpckldq 16(%esi),%mm7
262
punpckldq 56(%esi),%mm4
274
punpckldq 48(%esi),%mm7
286
punpckldq 88(%esi),%mm4
298
punpckldq 80(%esi),%mm7
310
punpckldq 120(%esi),%mm4
322
punpckldq 112(%esi),%mm7
338
punpckldq 8(%ebx),%mm3
350
punpckldq 24(%ebx),%mm6
362
punpckldq 40(%ebx),%mm3
374
punpckldq 56(%ebx),%mm6
386
punpckldq 72(%ebx),%mm3
398
punpckldq 88(%ebx),%mm6
410
punpckldq 104(%ebx),%mm3
422
punpckldq 120(%ebx),%mm6
488
movd %mm0,4*0x10*3(%ebx,%edi)
490
movq 4*0x1a(%edx),%mm0
493
movd %mm2,4*0x10*11(%ebx)
495
movd %mm2,4*0x10*5(%ebx,%edi)
496
movq 4*0x12(%edx),%mm2
498
movd %mm1,4*0x10*9(%ebx)
500
movd %mm1,4*0x10*7(%ebx,%edi)
502
movq 4*0x1e(%edx),%mm1
505
movd %mm2,4*0x10*7(%ebx)
507
movd %mm2,4*0x10*9(%ebx,%edi)
508
movq 4*0x16(%edx),%mm2
510
movd %mm0,4*0x10*5(%ebx)
512
movd %mm0,4*0x10*11(%ebx,%edi)
514
movd 4*0x19(%edx),%mm0
517
movd %mm2,4*0x10*3(%ebx)
519
movd %mm2,4*0x10*13(%ebx,%edi)
520
movd 4*0x11(%edx),%mm2
523
movd %mm1,4*0x10*1(%ebx)
525
movd %mm1,4*0x10*15(%ebx,%edi)