255
237
;-----------------------------------------------------------------------------
256
238
x264_pixel_ssd_16x8_sse2:
261
; %1=(row2, row0) %2=(row3, row1) %3=junk
262
; output in %1=(row3, row0) and %3=(row2, row1)
263
%macro HADAMARD4x4_SSE2 3
275
;;; two HADAMARD4x4_SSE2 running side-by-side
276
%macro HADAMARD4x4_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
297
%macro TRANSPOSE4x4_TWIST_SSE2 3 ; %1=(row3, row0) %2=(row2, row1) %3=junk, output in %1 and %2
300
punpckhwd %2, %3 ; backwards because the high quadwords are already swapped
311
;;; two TRANSPOSE4x4_TWIST_SSE2 running side-by-side
312
%macro TRANSPOSE4x4_TWIST_TWO_SSE2 6 ; a02 a13 junk1 b02 b13 junk2 (1=4 2=5 3=6)
333
;;; loads the difference of two 4x4 blocks into xmm0,xmm1 and xmm4,xmm5 in interleaved-row order
335
;;; the value in xmm7 doesn't matter: it's only subtracted from itself
336
%macro LOAD4x8_DIFF_SSE2 0
357
punpcklqdq xmm0, xmm2 ; rows 0 and 2
358
punpckhqdq xmm4, xmm2 ; next 4x4 rows 0 and 2
368
punpcklqdq xmm1, xmm3 ; rows 1 and 3
369
punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3
257
SUMSUB_BADC %1, %2, %3, %4
258
SUMSUB_BADC %1, %3, %2, %4
262
SUMSUB_BADC %1, %5, %2, %6
263
SUMSUB_BADC %3, %7, %4, %8
264
SUMSUB_BADC %1, %3, %2, %4
265
SUMSUB_BADC %5, %7, %6, %8
266
SUMSUB_BADC %1, %2, %3, %4
267
SUMSUB_BADC %5, %6, %7, %8
276
%macro SBUTTERFLY2 5 ; not really needed, but allows transpose4x4 to not shuffle registers
282
%macro TRANSPOSE4x4D 5 ; ABCD-T -> ADTC
283
SBUTTERFLY dqa, dq, %1, %2, %5
284
SBUTTERFLY dqa, dq, %3, %4, %2
285
SBUTTERFLY dqa, qdq, %1, %3, %4
286
SBUTTERFLY dqa, qdq, %5, %2, %3
289
%macro TRANSPOSE2x4x4W 5 ; ABCD-T -> ABCD
290
SBUTTERFLY dqa, wd, %1, %2, %5
291
SBUTTERFLY dqa, wd, %3, %4, %2
292
SBUTTERFLY dqa, dq, %1, %3, %4
293
SBUTTERFLY2 dqa, dq, %5, %2, %3
294
SBUTTERFLY dqa, qdq, %1, %3, %2
295
SBUTTERFLY2 dqa, qdq, %4, %5, %3
298
%macro TRANSPOSE8x8 9 ; ABCDEFGH-T -> AFHDTECB
299
SBUTTERFLY dqa, wd, %1, %2, %9
300
SBUTTERFLY dqa, wd, %3, %4, %2
301
SBUTTERFLY dqa, wd, %5, %6, %4
302
SBUTTERFLY dqa, wd, %7, %8, %6
303
SBUTTERFLY dqa, dq, %1, %3, %8
304
SBUTTERFLY dqa, dq, %9, %2, %3
305
SBUTTERFLY dqa, dq, %5, %7, %2
306
SBUTTERFLY dqa, dq, %4, %6, %7
307
SBUTTERFLY dqa, qdq, %1, %5, %6
308
SBUTTERFLY dqa, qdq, %9, %4, %5
309
SBUTTERFLY dqa, qdq, %8, %2, %4
310
SBUTTERFLY dqa, qdq, %3, %7, %2
313
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
372
321
%macro SUM1x8_SSE2 3 ; 01 junk sum
412
361
%macro SATD_TWO_SSE2 0
414
HADAMARD4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3
415
TRANSPOSE4x4_TWIST_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
416
HADAMARD4x4_TWO_SSE2 xmm0, xmm2, xmm1, xmm4, xmm3, xmm5
417
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm2, xmm4, xmm5, xmm3, xmm6
362
LOAD_DIFF_8P xmm0, xmm4, [parm1q], [parm3q]
363
LOAD_DIFF_8P xmm1, xmm5, [parm1q+parm2q], [parm3q+parm4q]
364
LOAD_DIFF_8P xmm2, xmm4, [parm1q+2*parm2q], [parm3q+2*parm4q]
365
LOAD_DIFF_8P xmm3, xmm5, [parm1q+r10], [parm3q+r11]
366
lea parm1q, [parm1q+4*parm2q]
367
lea parm3q, [parm3q+4*parm4q]
369
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
370
TRANSPOSE2x4x4W xmm0, xmm1, xmm2, xmm3, xmm4
371
HADAMARD1x4 xmm0, xmm1, xmm2, xmm3
372
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm4, xmm2, xmm3, xmm5, xmm6
420
375
%macro SATD_START 0
421
; mov rdi, rdi ; pix1
422
movsxd rsi, esi ; stride1
423
; mov rdx, rdx ; pix2
424
movsxd rcx, ecx ; stride2
428
381
%macro SATD_END 0
517
%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
531
%macro TRANSPOSE4x4D 5 ; abcd-t -> adtc
532
SBUTTERFLY dqa, dq, %1, %2, %5
533
SBUTTERFLY dqa, dq, %3, %4, %2
534
SBUTTERFLY dqa, qdq, %1, %3, %4
535
SBUTTERFLY dqa, qdq, %5, %2, %3
538
;-----------------------------------------------------------------------------
539
; input ABCDEFGH output AFHDTECB
540
;-----------------------------------------------------------------------------
541
%macro TRANSPOSE8x8 9
542
SBUTTERFLY dqa, wd, %1, %2, %9
543
SBUTTERFLY dqa, wd, %3, %4, %2
544
SBUTTERFLY dqa, wd, %5, %6, %4
545
SBUTTERFLY dqa, wd, %7, %8, %6
546
SBUTTERFLY dqa, dq, %1, %3, %8
547
SBUTTERFLY dqa, dq, %9, %2, %3
548
SBUTTERFLY dqa, dq, %5, %7, %2
549
SBUTTERFLY dqa, dq, %4, %6, %7
550
SBUTTERFLY dqa, qdq, %1, %5, %6
551
SBUTTERFLY dqa, qdq, %9, %4, %5
552
SBUTTERFLY dqa, qdq, %8, %2, %4
553
SBUTTERFLY dqa, qdq, %3, %7, %2
566
SUMSUB_BADC %1, %5, %2, %6
567
SUMSUB_BADC %3, %7, %4, %8
568
SUMSUB_BADC %1, %3, %2, %4
569
SUMSUB_BADC %5, %7, %6, %8
570
SUMSUB_BADC %1, %2, %3, %4
571
SUMSUB_BADC %5, %6, %7, %8
575
471
;-----------------------------------------------------------------------------
576
472
; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )