2
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4
; Use of this source code is governed by a BSD-style license
5
; that can be found in the LICENSE file in the root of the source
6
; tree. An additional intellectual property rights grant can be found
7
; in the file PATENTS. All contributing project authors may
8
; be found in the AUTHORS file in the root of the source tree.
11
%include "vpx_ports/x86_abi_support.asm"
13
; STACK_FRAME_CREATE_X3 (takes no parameters)
; Binds the symbolic operand names src_stride, ref_stride, result_ptr, max_sad
; and height to registers or argument slots and, in the first variant below,
; loads the pointer and stride arguments through arg().
; Three alternative sets of bindings appear; the conditional directives that
; choose between them are not shown here.
; NOTE(review): the bare integer lines interleaved with the code are not valid
; assembler statements -- they look like leftover line numbers. Confirm against
; the upstream file before assembling.
%macro STACK_FRAME_CREATE_X3 0
16
; Variant 1: arguments are fetched through arg(); strides live in rax/rdx.
%define src_stride rax
18
%define ref_stride rdx
21
; result_ptr, max_sad and height all name the same slot, arg(4); which meaning
; applies depends on the routine using the macro.
%define result_ptr arg(4)
22
%define max_sad arg(4)
23
%define height dword ptr arg(4)
30
mov rsi, arg(0) ; src_ptr
31
mov rdi, arg(2) ; ref_ptr
33
; Strides are read as dwords and widened with movsxd.
movsxd rax, dword ptr arg(1) ; src_stride
34
movsxd rdx, dword ptr arg(3) ; ref_stride
39
; Variant 2: src_stride in rdx; the slot shared by result_ptr/max_sad/height is
; addressed relative to rsp (presumably the Win64 path -- TODO confirm).
%define src_stride rdx
44
%define result_ptr [rsp+xmm_stack_space+8+4*8]
45
%define max_sad [rsp+xmm_stack_space+8+4*8]
46
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
49
; Variant 3: strides in rsi/rcx (presumably the non-Windows 64-bit path --
; TODO confirm).
%define src_stride rsi
51
%define ref_stride rcx
62
; STACK_FRAME_DESTROY_X3 (takes no parameters)
; Invoked as the final step of every routine in this file that opens with
; STACK_FRAME_CREATE_X3. The macro body is not shown here.
%macro STACK_FRAME_DESTROY_X3 0
86
; STACK_FRAME_CREATE_X4 (takes no parameters)
; Entry sequence for the *x4d routines: binds src_stride, ref_stride and
; result_ptr, and uses LOAD_X4_ADDRESSES to fetch four reference pointers from
; the pointer table passed as ref_ptr_base.
; Three variants appear below; only the LIBVPX_YASM_WIN64 test is visible, the
; other conditional directives are not shown here.
%macro STACK_FRAME_CREATE_X4 0
89
; Variant 1: arguments are fetched through arg().
%define src_stride rax
94
%define ref_stride rbp
95
%define result_ptr arg(4)
103
mov rdi, arg(2) ; ref_ptr_base
105
; rdi is both the table base and the last destination; that is safe because
; LOAD_X4_ADDRESSES writes its fifth operand last.
LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
107
mov rsi, arg(0) ; src_ptr
109
; NOTE(review): src_stride is bound to rax above, but the stride is loaded into
; rbx here and rax already holds one of the reference pointers. Whatever
; reconciles the two registers is not shown here -- confirm.
movsxd rbx, dword ptr arg(1) ; src_stride
110
movsxd rbp, dword ptr arg(3) ; ref_stride
114
%if LIBVPX_YASM_WIN64
117
; Win64 variant: strides in rdx/r9, result pointer addressed relative to rsp,
; pointer table base in r8.
%define src_stride rdx
122
%define ref_stride r9
123
%define result_ptr [rsp+xmm_stack_space+16+4*8]
126
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
129
; Remaining variant: strides in rsi/rcx, result pointer in r8, pointer table
; base in rdx (presumably the non-Windows 64-bit path -- TODO confirm).
%define src_stride rsi
134
%define ref_stride rcx
135
%define result_ptr r8
137
LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
143
; STACK_FRAME_DESTROY_X4 (takes no parameters)
; Invoked as the final step of every routine in this file that opens with
; STACK_FRAME_CREATE_X4. Only the start of a LIBVPX_YASM_WIN64-specific section
; is visible; the rest of the body is not shown here.
%macro STACK_FRAME_DESTROY_X4 0
159
%if LIBVPX_YASM_WIN64
167
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;   %1  stage selector; callers in this file pass 0 for the first invocation,
;       1 for the middle ones and 2 for the last. How it is tested is not
;       shown here.
;   %2  source pointer, read with movdqa (the addressed row must be 16-byte
;       aligned)
;   %3  reference pointer, read with lddqu (no alignment requirement)
;   %4  source stride, used to reach the second row
;   %5  reference stride, used to reach the second row
; For each of two rows, loads 16 source bytes plus three 16-byte reference
; windows starting at byte offsets 0, +1 and +2. The instructions that consume
; the loaded registers are not shown here.
%macro PROCESS_16X2X3 5
169
; First row, loaded into xmm5..xmm7.
movdqa xmm0, XMMWORD PTR [%2]
170
lddqu xmm5, XMMWORD PTR [%3]
171
lddqu xmm6, XMMWORD PTR [%3+1]
172
lddqu xmm7, XMMWORD PTR [%3+2]
178
; First row again, this time into xmm1..xmm3 (presumably the path taken when
; xmm5..xmm7 already hold running totals -- TODO confirm).
movdqa xmm0, XMMWORD PTR [%2]
179
lddqu xmm1, XMMWORD PTR [%3]
180
lddqu xmm2, XMMWORD PTR [%3+1]
181
lddqu xmm3, XMMWORD PTR [%3+2]
191
; Second row: one stride below the first in both buffers.
movdqa xmm0, XMMWORD PTR [%2+%4]
192
lddqu xmm1, XMMWORD PTR [%3+%5]
193
lddqu xmm2, XMMWORD PTR [%3+%5+1]
194
lddqu xmm3, XMMWORD PTR [%3+%5+2]
210
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;   %1  stage selector; callers in this file pass 0 for the first invocation,
;       1 for the middle ones and 2 for the last. How it is tested is not
;       shown here.
;   %2  source pointer
;   %3  reference pointer
;   %4  source stride, used to reach the second row
;   %5  reference stride, used to reach the second row
; 8-byte-wide counterpart of PROCESS_16X2X3 using the MMX registers: for each
; of two rows, loads 8 source bytes plus three 8-byte reference windows at
; byte offsets 0, +1 and +2. The instructions that consume the loaded
; registers are not shown here.
%macro PROCESS_8X2X3 5
212
; First row, loaded into mm5..mm7.
movq mm0, QWORD PTR [%2]
213
movq mm5, QWORD PTR [%3]
214
movq mm6, QWORD PTR [%3+1]
215
movq mm7, QWORD PTR [%3+2]
221
; First row again, this time into mm1..mm3 (presumably the path taken when
; mm5..mm7 already hold running totals -- TODO confirm).
movq mm0, QWORD PTR [%2]
222
movq mm1, QWORD PTR [%3]
223
movq mm2, QWORD PTR [%3+1]
224
movq mm3, QWORD PTR [%3+2]
234
; Second row: one stride below the first in both buffers.
movq mm0, QWORD PTR [%2+%4]
235
movq mm1, QWORD PTR [%3+%5]
236
movq mm2, QWORD PTR [%3+%5+1]
237
movq mm3, QWORD PTR [%3+%5+2]
253
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
;   %1      register holding the address of a table of four entries, each
;           REG_SZ_BYTES wide (REG_SZ_BYTES is defined outside this section)
;   %2..%5  destinations for entries 0..3
; Entries are loaded in order, so %5 may be the same register as %1: the base
; is only overwritten by the final load.
%macro LOAD_X4_ADDRESSES 5
254
mov %2, [%1+REG_SZ_BYTES*0]
255
mov %3, [%1+REG_SZ_BYTES*1]
257
mov %4, [%1+REG_SZ_BYTES*2]
258
mov %5, [%1+REG_SZ_BYTES*3]
261
; PROCESS_16X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;   %1      stage selector; callers in this file pass 0 for the first
;           invocation, 1 for the middle ones and 2 for the last. How it is
;           tested is not shown here.
;   %2      source pointer, read with movdqa (the addressed row must be
;           16-byte aligned)
;   %3..%6  four reference pointers, read with lddqu (no alignment requirement)
;   %7      source stride, used to reach the second row
;   %8      reference stride, shared by all four reference pointers
; For each of two rows, loads 16 source bytes and the matching 16 bytes from
; each of the four references. The instructions that consume the loaded
; registers are not shown here.
%macro PROCESS_16X2X4 8
263
; First row, one register per reference (xmm4..xmm7).
movdqa xmm0, XMMWORD PTR [%2]
264
lddqu xmm4, XMMWORD PTR [%3]
265
lddqu xmm5, XMMWORD PTR [%4]
266
lddqu xmm6, XMMWORD PTR [%5]
267
lddqu xmm7, XMMWORD PTR [%6]
274
; First row again using only xmm1..xmm3 as temporaries (presumably the path
; taken when xmm4..xmm7 already hold running totals -- TODO confirm).
movdqa xmm0, XMMWORD PTR [%2]
275
lddqu xmm1, XMMWORD PTR [%3]
276
lddqu xmm2, XMMWORD PTR [%4]
277
lddqu xmm3, XMMWORD PTR [%5]
284
; The fourth reference reuses xmm1.
lddqu xmm1, XMMWORD PTR [%6]
291
; Second row: one stride below the first in every buffer.
movdqa xmm0, XMMWORD PTR [%2+%7]
292
lddqu xmm1, XMMWORD PTR [%3+%8]
293
lddqu xmm2, XMMWORD PTR [%4+%8]
294
lddqu xmm3, XMMWORD PTR [%5+%8]
301
; The fourth reference reuses xmm1 again.
lddqu xmm1, XMMWORD PTR [%6+%8]
319
; PROCESS_8X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;   %1      stage selector; callers in this file pass 0 for the first
;           invocation, 1 for the middle ones and 2 for the last. How it is
;           tested is not shown here.
;   %2      source pointer
;   %3..%6  four reference pointers
;   %7      source stride, used to reach the second row
;   %8      reference stride, shared by all four reference pointers
; 8-byte-wide counterpart of PROCESS_16X2X4 using the MMX registers: for each
; of two rows, loads 8 source bytes and the matching 8 bytes from each of the
; four references. The instructions that consume the loaded registers are not
; shown here.
%macro PROCESS_8X2X4 8
321
; First row, one register per reference (mm4..mm7).
movq mm0, QWORD PTR [%2]
322
movq mm4, QWORD PTR [%3]
323
movq mm5, QWORD PTR [%4]
324
movq mm6, QWORD PTR [%5]
325
movq mm7, QWORD PTR [%6]
332
; First row again using only mm1..mm3 as temporaries (presumably the path
; taken when mm4..mm7 already hold running totals -- TODO confirm).
movq mm0, QWORD PTR [%2]
333
movq mm1, QWORD PTR [%3]
334
movq mm2, QWORD PTR [%4]
335
movq mm3, QWORD PTR [%5]
342
; The fourth reference reuses mm1.
movq mm1, QWORD PTR [%6]
349
; Second row: one stride below the first in every buffer.
movq mm0, QWORD PTR [%2+%7]
350
movq mm1, QWORD PTR [%3+%8]
351
movq mm2, QWORD PTR [%4+%8]
352
movq mm3, QWORD PTR [%5+%8]
359
; The fourth reference reuses mm1 again.
movq mm1, QWORD PTR [%6+%8]
377
; vp8_sad16x16x3_sse3
; Invokes PROCESS_16X2X3 eight times (two rows per invocation, matching the
; 16-row height in the name) between STACK_FRAME_CREATE_X3 and
; STACK_FRAME_DESTROY_X3. Per row the macro reads 16 source bytes and the
; reference bytes at offsets 0, +1 and +2.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad16x16x3_sse3(
378
; unsigned char *src_ptr,
380
; unsigned char *ref_ptr,
383
global sym(vp8_sad16x16x3_sse3) PRIVATE
384
sym(vp8_sad16x16x3_sse3):
386
STACK_FRAME_CREATE_X3
388
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
389
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
390
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
391
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
392
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
393
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
394
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
395
PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
417
STACK_FRAME_DESTROY_X3
419
; vp8_sad16x8x3_sse3
; Invokes PROCESS_16X2X3 four times (two rows per invocation, matching the
; 8-row height in the name) between STACK_FRAME_CREATE_X3 and
; STACK_FRAME_DESTROY_X3.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad16x8x3_sse3(
420
; unsigned char *src_ptr,
422
; unsigned char *ref_ptr,
425
global sym(vp8_sad16x8x3_sse3) PRIVATE
426
sym(vp8_sad16x8x3_sse3):
428
STACK_FRAME_CREATE_X3
430
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
431
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
432
PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
433
PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
455
STACK_FRAME_DESTROY_X3
457
; vp8_sad8x16x3_sse3
; Invokes PROCESS_8X2X3 eight times (two rows per invocation, matching the
; 16-row height in the name) between STACK_FRAME_CREATE_X3 and
; STACK_FRAME_DESTROY_X3. PROCESS_8X2X3 works in the MMX registers.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad8x16x3_sse3(
458
; unsigned char *src_ptr,
460
; unsigned char *ref_ptr,
463
global sym(vp8_sad8x16x3_sse3) PRIVATE
464
sym(vp8_sad8x16x3_sse3):
466
STACK_FRAME_CREATE_X3
468
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
469
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
470
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
471
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
472
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
473
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
474
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
475
PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
484
STACK_FRAME_DESTROY_X3
486
; vp8_sad8x8x3_sse3
; Invokes PROCESS_8X2X3 four times (two rows per invocation, matching the
; 8-row height in the name) between STACK_FRAME_CREATE_X3 and
; STACK_FRAME_DESTROY_X3. PROCESS_8X2X3 works in the MMX registers.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad8x8x3_sse3(
487
; unsigned char *src_ptr,
489
; unsigned char *ref_ptr,
492
global sym(vp8_sad8x8x3_sse3) PRIVATE
493
sym(vp8_sad8x8x3_sse3):
495
STACK_FRAME_CREATE_X3
497
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
498
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
499
PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
500
PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
509
STACK_FRAME_DESTROY_X3
511
; vp8_sad4x4x3_sse3
; Hand-unrolled 4x4 variant working in the MMX registers. Rows are fetched as
; 4-byte dwords with movd: rows 0-1 first, then both pointers advance by two
; strides and rows 2-3 are fetched. For every row the reference is read at
; byte offsets 0, +1 and +2.
; The remaining prototype lines and the instructions that combine, reduce and
; store the values are not shown here.
;void vp8_sad4x4x3_sse3(
512
; unsigned char *src_ptr,
514
; unsigned char *ref_ptr,
517
global sym(vp8_sad4x4x3_sse3) PRIVATE
518
sym(vp8_sad4x4x3_sse3):
520
STACK_FRAME_CREATE_X3
522
; Rows 0 and 1: source and reference at offset 0.
movd mm0, DWORD PTR [src_ptr]
523
movd mm1, DWORD PTR [ref_ptr]
525
movd mm2, DWORD PTR [src_ptr+src_stride]
526
movd mm3, DWORD PTR [ref_ptr+ref_stride]
531
; Rows 0 and 1: reference at offsets +1 and +2 (mm2/mm3 are reused for row 1).
movd mm4, DWORD PTR [ref_ptr+1]
532
movd mm5, DWORD PTR [ref_ptr+2]
534
movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
535
movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
545
; Step both pointers down two rows.
lea src_ptr, [src_ptr+src_stride*2]
546
lea ref_ptr, [ref_ptr+ref_stride*2]
548
; Rows 2 and 3: source and reference at offset 0.
movd mm0, DWORD PTR [src_ptr]
549
movd mm2, DWORD PTR [ref_ptr]
551
movd mm3, DWORD PTR [src_ptr+src_stride]
552
movd mm6, DWORD PTR [ref_ptr+ref_stride]
557
; Rows 2 and 3: reference at offsets +1 and +2.
movd mm3, DWORD PTR [ref_ptr+1]
558
movd mm7, DWORD PTR [ref_ptr+2]
564
movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
565
movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
583
STACK_FRAME_DESTROY_X3
585
; vp8_sad16x16_sse3
; Single-reference 16-byte-wide routine built around one loop. Each pass of the
; loop reads four rows: two rows, then both pointers advance by two strides,
; two more rows, then another two-stride advance. Source rows are read with
; movdqa (must be 16-byte aligned); reference rows with movdqu (unaligned).
; The remaining prototype lines, the instructions that consume the loaded
; rows, and the instruction that sets the flags tested by jne are not shown
; here.
;unsigned int vp8_sad16x16_sse3(
586
; unsigned char *src_ptr,
588
; unsigned char *ref_ptr,
591
; Disabled alias: would make every lddqu in the file assemble as movdqu.
;%define lddqu movdqu
592
global sym(vp8_sad16x16_sse3) PRIVATE
593
sym(vp8_sad16x16_sse3):
595
STACK_FRAME_CREATE_X3
600
.vp8_sad16x16_sse3_loop:
601
; Rows n and n+1.
movdqa xmm0, XMMWORD PTR [src_ptr]
602
movdqu xmm1, XMMWORD PTR [ref_ptr]
603
movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
604
movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
606
lea src_ptr, [src_ptr+src_stride*2]
607
lea ref_ptr, [ref_ptr+ref_stride*2]
609
; Rows n+2 and n+3; the last reference row reuses xmm1.
movdqa xmm4, XMMWORD PTR [src_ptr]
610
movdqu xmm5, XMMWORD PTR [ref_ptr]
611
movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
615
movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
621
lea src_ptr, [src_ptr+src_stride*2]
622
lea ref_ptr, [ref_ptr+ref_stride*2]
630
jne .vp8_sad16x16_sse3_loop
637
STACK_FRAME_DESTROY_X3
639
; vp8_copy32xn_sse3
; Copies rows of 32 bytes (two 16-byte halves per row) from src_ptr to the
; destination, which is addressed through ref_ptr / ref_stride. Source rows
; are read with movdqu (unaligned); destination rows are written with movdqa,
; so every destination row must be 16-byte aligned.
; A main loop moves four rows per pass and a tail loop moves one row per pass.
; The remaining prototype lines, the binding of end_ptr, and the row-count
; arithmetic that sets the flags tested by jge/jne are not shown here.
;void vp8_copy32xn_sse3(
640
; unsigned char *src_ptr,
642
; unsigned char *dst_ptr,
645
global sym(vp8_copy32xn_sse3) PRIVATE
646
sym(vp8_copy32xn_sse3):
648
STACK_FRAME_CREATE_X3
650
.block_copy_sse3_loopx4:
651
; end_ptr is a scratch pointer to the third source row of this group.
lea end_ptr, [src_ptr+src_stride*2]
653
; Load four source rows (8 x 16 bytes) into xmm0..xmm7.
movdqu xmm0, XMMWORD PTR [src_ptr]
654
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
655
movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
656
movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
657
movdqu xmm4, XMMWORD PTR [end_ptr]
658
movdqu xmm5, XMMWORD PTR [end_ptr + 16]
659
movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
660
movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
662
lea src_ptr, [src_ptr+src_stride*4]
664
; Re-point end_ptr at the third destination row of this group.
lea end_ptr, [ref_ptr+ref_stride*2]
666
; Store the four rows to the destination.
movdqa XMMWORD PTR [ref_ptr], xmm0
667
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
668
movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
669
movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
670
movdqa XMMWORD PTR [end_ptr], xmm4
671
movdqa XMMWORD PTR [end_ptr + 16], xmm5
672
movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
673
movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
675
lea ref_ptr, [ref_ptr+ref_stride*4]
679
jge .block_copy_sse3_loopx4
681
;Check to see if there are more rows that need to be copied.
685
.block_copy_sse3_loop:
686
; Tail loop: one 32-byte row per pass.
movdqu xmm0, XMMWORD PTR [src_ptr]
687
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
688
lea src_ptr, [src_ptr+src_stride]
690
movdqa XMMWORD PTR [ref_ptr], xmm0
691
movdqa XMMWORD PTR [ref_ptr + 16], xmm1
692
lea ref_ptr, [ref_ptr+ref_stride]
695
jne .block_copy_sse3_loop
698
STACK_FRAME_DESTROY_X3
700
; vp8_sad16x16x4d_sse3
; Invokes PROCESS_16X2X4 eight times (two rows per invocation, matching the
; 16-row height in the name) between STACK_FRAME_CREATE_X4 and
; STACK_FRAME_DESTROY_X4. Each invocation reads the source rows and the
; matching rows of the four references r0_ptr..r3_ptr, which
; STACK_FRAME_CREATE_X4 loads from the ref_ptr_base table.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad16x16x4d_sse3(
701
; unsigned char *src_ptr,
703
; unsigned char *ref_ptr_base,
706
global sym(vp8_sad16x16x4d_sse3) PRIVATE
707
sym(vp8_sad16x16x4d_sse3):
709
STACK_FRAME_CREATE_X4
711
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
712
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
713
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
714
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
715
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
716
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
717
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
718
PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
749
STACK_FRAME_DESTROY_X4
751
; vp8_sad16x8x4d_sse3
; Invokes PROCESS_16X2X4 four times (two rows per invocation, matching the
; 8-row height in the name) between STACK_FRAME_CREATE_X4 and
; STACK_FRAME_DESTROY_X4, reading the source and the four references
; r0_ptr..r3_ptr.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad16x8x4d_sse3(
752
; unsigned char *src_ptr,
754
; unsigned char *ref_ptr_base,
757
global sym(vp8_sad16x8x4d_sse3) PRIVATE
758
sym(vp8_sad16x8x4d_sse3):
760
STACK_FRAME_CREATE_X4
762
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
763
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
764
PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
765
PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
796
STACK_FRAME_DESTROY_X4
798
; vp8_sad8x16x4d_sse3
; Invokes PROCESS_8X2X4 eight times (two rows per invocation, matching the
; 16-row height in the name) between STACK_FRAME_CREATE_X4 and
; STACK_FRAME_DESTROY_X4, reading the source and the four references
; r0_ptr..r3_ptr. PROCESS_8X2X4 works in the MMX registers.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad8x16x4d_sse3(
799
; unsigned char *src_ptr,
801
; unsigned char *ref_ptr,
804
global sym(vp8_sad8x16x4d_sse3) PRIVATE
805
sym(vp8_sad8x16x4d_sse3):
807
STACK_FRAME_CREATE_X4
809
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
810
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
811
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
812
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
813
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
814
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
815
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
816
PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
829
STACK_FRAME_DESTROY_X4
831
; vp8_sad8x8x4d_sse3
; Invokes PROCESS_8X2X4 four times (two rows per invocation, matching the
; 8-row height in the name) between STACK_FRAME_CREATE_X4 and
; STACK_FRAME_DESTROY_X4, reading the source and the four references
; r0_ptr..r3_ptr. PROCESS_8X2X4 works in the MMX registers.
; The remaining prototype lines and the code that reduces and stores the
; results are not shown here.
;void vp8_sad8x8x4d_sse3(
832
; unsigned char *src_ptr,
834
; unsigned char *ref_ptr,
837
global sym(vp8_sad8x8x4d_sse3) PRIVATE
838
sym(vp8_sad8x8x4d_sse3):
840
STACK_FRAME_CREATE_X4
842
; Stage selector: 0 on the first invocation, 1 on the middle ones, 2 on the last.
PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
843
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
844
PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
845
PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
858
STACK_FRAME_DESTROY_X4
860
; vp8_sad4x4x4d_sse3
; Hand-unrolled 4x4 variant for four references, working in the MMX registers.
; Rows are fetched as 4-byte dwords with movd: rows 0-1 of the source and of
; r0_ptr..r3_ptr first, then all five pointers advance by two strides and
; rows 2-3 are fetched.
; The remaining prototype lines and the instructions that combine, reduce and
; store the values are not shown here.
;void vp8_sad4x4x4d_sse3(
861
; unsigned char *src_ptr,
863
; unsigned char *ref_ptr,
866
global sym(vp8_sad4x4x4d_sse3) PRIVATE
867
sym(vp8_sad4x4x4d_sse3):
869
STACK_FRAME_CREATE_X4
871
; Rows 0 and 1: source and first reference.
movd mm0, DWORD PTR [src_ptr]
872
movd mm1, DWORD PTR [r0_ptr]
874
movd mm2, DWORD PTR [src_ptr+src_stride]
875
movd mm3, DWORD PTR [r0_ptr+ref_stride]
880
; Rows 0 and 1: remaining three references (mm2/mm3 are reused for row 1).
movd mm4, DWORD PTR [r1_ptr]
881
movd mm5, DWORD PTR [r2_ptr]
883
movd mm6, DWORD PTR [r3_ptr]
884
movd mm2, DWORD PTR [r1_ptr+ref_stride]
886
movd mm3, DWORD PTR [r2_ptr+ref_stride]
887
movd mm7, DWORD PTR [r3_ptr+ref_stride]
902
; Step the source and all four reference pointers down two rows.
lea src_ptr, [src_ptr+src_stride*2]
903
lea r0_ptr, [r0_ptr+ref_stride*2]
905
lea r1_ptr, [r1_ptr+ref_stride*2]
906
lea r2_ptr, [r2_ptr+ref_stride*2]
908
lea r3_ptr, [r3_ptr+ref_stride*2]
910
; Rows 2 and 3: source and first reference.
movd mm0, DWORD PTR [src_ptr]
911
movd mm2, DWORD PTR [r0_ptr]
913
movd mm3, DWORD PTR [src_ptr+src_stride]
914
movd mm7, DWORD PTR [r0_ptr+ref_stride]
919
; Row 2 of the second and third references.
movd mm3, DWORD PTR [r1_ptr]
920
movd mm7, DWORD PTR [r2_ptr]
927
; From here on ref_stride expands to rax instead of its earlier binding.
; NOTE(review): the surrounding conditional and the instruction that moves the
; stride value into rax are not shown here -- confirm.
%define ref_stride rax
934
; Row 3 of the second and third references.
movd mm2, DWORD PTR [r1_ptr+ref_stride]
935
movd mm1, DWORD PTR [r2_ptr+ref_stride]
943
; Rows 2 and 3 of the fourth reference.
movd mm2, DWORD PTR [r3_ptr]
944
movd mm1, DWORD PTR [r3_ptr+ref_stride]
959
STACK_FRAME_DESTROY_X4