;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;*          Min Chen <chenm001@163.com> (converted to nasm)
;*          Loren Merritt <lorenm@u.washington.edu> (dct8)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

;*****************************************************************************
;* Revision history:
;* 2004.04.28 ported all 4x4 functions to nasm (CM)
;*****************************************************************************

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
40
%include "amd64inc.asm"
46
%macro MMX_LOAD_DIFF_4P 5
54
%macro MMX_LOAD_DIFF_8P 5
62
%macro MMX_SUMSUB_BA 2
68
%macro MMX_SUMSUB_BADC 4
77
%macro MMX_SUMSUB2_AB 3
85
%macro MMX_SUMSUBD2_AB 4
100
;-----------------------------------------------------------------------------
101
; input ABCD output ADTC
102
;-----------------------------------------------------------------------------
103
%macro MMX_TRANSPOSE 5
104
SBUTTERFLY q, wd, %1, %2, %5
105
SBUTTERFLY q, wd, %3, %4, %2
106
SBUTTERFLY q, dq, %1, %3, %4
107
SBUTTERFLY q, dq, %5, %2, %3
110
;-----------------------------------------------------------------------------
111
; input ABCDEFGH output AFHDTECB
112
;-----------------------------------------------------------------------------
113
%macro SSE2_TRANSPOSE8x8 9
114
SBUTTERFLY dqa, wd, %1, %2, %9
115
SBUTTERFLY dqa, wd, %3, %4, %2
116
SBUTTERFLY dqa, wd, %5, %6, %4
117
SBUTTERFLY dqa, wd, %7, %8, %6
118
SBUTTERFLY dqa, dq, %1, %3, %8
119
SBUTTERFLY dqa, dq, %9, %2, %3
120
SBUTTERFLY dqa, dq, %5, %7, %2
121
SBUTTERFLY dqa, dq, %4, %6, %7
122
SBUTTERFLY dqa, qdq, %1, %5, %6
123
SBUTTERFLY dqa, qdq, %9, %4, %5
124
SBUTTERFLY dqa, qdq, %8, %2, %4
125
SBUTTERFLY dqa, qdq, %3, %7, %2
128
%macro MMX_STORE_DIFF_4P 5
138
%macro MMX_STORE_DIFF_8P 4
147
;=============================================================================
149
;=============================================================================
155
;=============================================================================
157
;=============================================================================
161
;-----------------------------------------------------------------------------
162
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
163
;-----------------------------------------------------------------------------
164
cglobal x264_dct4x4dc_mmx
165
movq mm0, [parm1q+ 0]
166
movq mm1, [parm1q+ 8]
167
movq mm2, [parm1q+16]
168
movq mm3, [parm1q+24]
170
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
171
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
173
MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
175
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
176
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
178
movq mm6, [pw_1 GLOBAL]
193
;-----------------------------------------------------------------------------
194
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
195
;-----------------------------------------------------------------------------
196
cglobal x264_idct4x4dc_mmx
197
movq mm0, [parm1q+ 0]
198
movq mm1, [parm1q+ 8]
199
movq mm2, [parm1q+16]
200
movq mm3, [parm1q+24]
202
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
203
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
205
MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
207
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
208
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
210
movq [parm1q+ 0], mm0
211
movq [parm1q+ 8], mm2
212
movq [parm1q+16], mm3
213
movq [parm1q+24], mm4
216
;-----------------------------------------------------------------------------
217
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
218
;-----------------------------------------------------------------------------
219
cglobal x264_sub4x4_dct_mmx
223
MMX_LOAD_DIFF_4P mm0, mm6, mm7, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
224
MMX_LOAD_DIFF_4P mm1, mm6, mm7, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
225
MMX_LOAD_DIFF_4P mm2, mm6, mm7, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
226
MMX_LOAD_DIFF_4P mm3, mm6, mm7, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
228
MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
230
MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
231
MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
233
; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
234
MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1
236
MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
238
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
239
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
241
movq [parm1q+ 0], mm1
242
movq [parm1q+ 8], mm2
243
movq [parm1q+16], mm3
244
movq [parm1q+24], mm0
247
;-----------------------------------------------------------------------------
248
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
249
;-----------------------------------------------------------------------------
250
cglobal x264_add4x4_idct_mmx
252
movq mm0, [parm2q+ 0] ; dct
253
movq mm1, [parm2q+ 8]
254
movq mm2, [parm2q+16]
255
movq mm3, [parm2q+24]
257
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
258
MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
260
MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
262
; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
263
MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3
265
MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
266
MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
268
MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
271
movq mm6, [pw_32 GLOBAL]
273
MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [parm1q+0*FDEC_STRIDE]
274
MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [parm1q+1*FDEC_STRIDE]
275
MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [parm1q+2*FDEC_STRIDE]
276
MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [parm1q+3*FDEC_STRIDE]
282
; =============================================================================
284
; =============================================================================
289
MMX_SUMSUB_BA %8, %1 ; %8=s07, %1=d07
290
MMX_SUMSUB_BA %7, %2 ; %7=s16, %2=d16
291
MMX_SUMSUB_BA %6, %3 ; %6=s25, %3=d25
292
MMX_SUMSUB_BA %5, %4 ; %5=s34, %4=d34
294
MMX_SUMSUB_BA %5, %8 ; %5=a0, %8=a2
295
MMX_SUMSUB_BA %6, %7 ; %6=a1, %7=a3
307
psubw %10, %3 ; %10=a7
317
MMX_SUMSUB_BA %6, %5 ; %6=b0, %5=b4
323
psubw %9, %10 ; %9=b7
338
;-----------------------------------------------------------------------------
339
; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
340
;-----------------------------------------------------------------------------
341
cglobal x264_sub8x8_dct8_sse2
344
MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
345
MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
346
MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
347
MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
348
MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
349
MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
350
MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
351
MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]
353
DCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
354
SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
355
DCT8_1D xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9
357
movdqa [parm1q+0x00], xmm4
358
movdqa [parm1q+0x10], xmm3
359
movdqa [parm1q+0x20], xmm8
360
movdqa [parm1q+0x30], xmm2
361
movdqa [parm1q+0x40], xmm0
362
movdqa [parm1q+0x50], xmm6
363
movdqa [parm1q+0x60], xmm1
364
movdqa [parm1q+0x70], xmm7
372
MMX_SUMSUB_BA %5, %1 ; %5=a0, %1=a2
377
paddw %7, %10 ; %7=a6
389
psubw %10, %2 ; %10=a5
400
MMX_SUMSUB_BA %7, %5 ; %7=b0, %5=b6
401
MMX_SUMSUB_BA %3, %1 ; %3=b2, %1=b4
413
psubw %2, %10 ; %2=b5
415
MMX_SUMSUB_BA %9, %7 ; %9=c0, %7=c7
416
MMX_SUMSUB_BA %2, %3 ; %2=c1, %3=c6
417
MMX_SUMSUB_BA %8, %1 ; %8=c2, %1=c5
418
MMX_SUMSUB_BA %4, %5 ; %4=c3, %5=c4
421
;-----------------------------------------------------------------------------
422
; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
423
;-----------------------------------------------------------------------------
424
cglobal x264_add8x8_idct8_sse2
425
movdqa xmm0, [parm2q+0x00]
426
movdqa xmm1, [parm2q+0x10]
427
movdqa xmm2, [parm2q+0x20]
428
movdqa xmm3, [parm2q+0x30]
429
movdqa xmm4, [parm2q+0x40]
430
movdqa xmm5, [parm2q+0x50]
431
movdqa xmm6, [parm2q+0x60]
432
movdqa xmm7, [parm2q+0x70]
434
IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
435
SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
436
paddw xmm9, [pw_32 GLOBAL] ; rounding for the >>6 at the end
437
IDCT8_1D xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2
440
MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
441
MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
442
MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
443
MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
444
MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
445
MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
446
MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
447
MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
452
;-----------------------------------------------------------------------------
453
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
454
; uint8_t *pix1, uint8_t *pix2 )
455
;-----------------------------------------------------------------------------
460
add parm2q, %4-%5*FENC_STRIDE
461
add parm3q, %4-%5*FDEC_STRIDE
464
add parm2q, %4*FENC_STRIDE-%6
465
add parm3q, %4*FDEC_STRIDE-%6
468
add parm2q, %4-%5*FENC_STRIDE
469
add parm3q, %4-%5*FDEC_STRIDE
473
;-----------------------------------------------------------------------------
474
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
475
;-----------------------------------------------------------------------------
476
%macro ADD_NxN_IDCT 6
479
add parm1q, %4-%5*FDEC_STRIDE
482
add parm1q, %4*FDEC_STRIDE-%6
485
add parm1q, %4-%5*FDEC_STRIDE
490
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4, 0, 4
491
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4, 0, 4
493
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 32, 4, 4, 12
494
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 32, 4, 4, 12
496
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
497
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
500
;-----------------------------------------------------------------------------
501
; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
502
;-----------------------------------------------------------------------------
503
cglobal x264_zigzag_scan_4x4_field_sse2
504
punpcklwd xmm0, [parm2q]
505
punpckhwd xmm1, [parm2q]
506
punpcklwd xmm2, [parm2q+16]
507
punpckhwd xmm3, [parm2q+16]
513
movdqa [parm1q+16], xmm1
514
movdqa [parm1q+32], xmm2
516
movdqa [parm1q+48], xmm3
517
movq [parm1q+12], xmm0
518
movd [parm1q+ 8], xmm1