;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264asm assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.

%define program_name ff

%ifidn __OUTPUT_FORMAT__,win32
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align
; attribute, so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,aout
        section .text ; aout does not support align=
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
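
; A slightly fuller, purely illustrative sketch of the pattern (this function
; does not exist in this file):
;     cglobal copy_dword, 2,3,0, dst, src, tmp
;         mov  tmpd, [srcq]
;         mov  [dstq], tmpd
;         RET
; The same source assembles to the correct calling convention on win64,
; linux64 and x86_32.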

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
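
; e.g. (illustrative only):
;     movzx r1d, word [r0q]  ; r0q = native-size view of argument 0, used as a pointer
;     add   r1w, 1           ; r1w = word view of the same register as r1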

%macro DECLARE_REG 6 ; reg_id, qword reg, dword reg, word reg, byte reg, location of arg
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
    %ifndef ARCH_X86_64
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
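
; e.g. (hypothetical): "DECLARE_REG_TMP 2,3" makes t0/t1 alias r2/r3, so
; shared code can say "mov t0d, 5" while each arch picks its own registers.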

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro
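
; e.g. "movsxdifnidn r1, r1d" sign-extends a dword stride to native size on
; x86_64, and expands to nothing on x86_32, where r1 and r1d name the same
; register.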

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro

%ifdef WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%endif ;======================================================================

%ifndef WIN64 ; dummy macros on the platforms that don't spill xmm regs
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

%macro INIT_MMX 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_XMM 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

INIT_MMX

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping
; some registers.

; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.

; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
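
; e.g. (illustrative): after
;     SWAP 0, 1
; any later reference to m0 names the register that was previously m1, and
; vice versa; no instructions are emitted.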

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
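
; e.g. (hypothetical):
;     cglobal transpose4x4_internal
;         ...
;         SWAP 1, 3
;         SAVE_MM_PERMUTATION transpose4x4_internal
;         ret
; A later "call transpose4x4_internal" reloads that name->register mapping,
; so the caller sees the callee's results under the permuted m# names.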

; Substitutions that reduce instruction size but are functionally equivalent
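; e.g. "add r0, 128" can be emitted as "sub r0, -128", because -128 fits in a
; sign-extended 8-bit immediate while +128 needs a full 32-bit one.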

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %if sizeof%5==8
        %define %%regmov movq
    %elif %2
        %define %%regmov movaps
    %else
        %define %%regmov movdqa
    %endif

    %if %4>=3+%3
        %ifnidn %5, %6
            %if avx_enabled && sizeof%5==16
                v%1 %5, %6, %7
            %else
                %%regmov %5, %6
                %1 %5, %7
            %endif
        %else
            %1 %5, %7
        %endif
    %elif %3
        %1 %5, %6, %7
    %else
        %1 %5, %6
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
%macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
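
; e.g. with the definitions below, "mulps m0, m1, m2" becomes
; "vmulps xmm0, xmm1, xmm2" when avx_enabled, and otherwise is emulated as
; "movaps xmm0, xmm1" followed by "mulps xmm0, xmm2".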

AVX_INSTR addpd, 1, 0
AVX_INSTR addps, 1, 0
AVX_INSTR addsd, 1, 0
AVX_INSTR addss, 1, 0
AVX_INSTR addsubpd, 1, 0
AVX_INSTR addsubps, 1, 0
AVX_INSTR andpd, 1, 0
AVX_INSTR andps, 1, 0
AVX_INSTR andnpd, 1, 0
AVX_INSTR andnps, 1, 0
AVX_INSTR blendpd, 1, 0
AVX_INSTR blendps, 1, 0
AVX_INSTR blendvpd, 1, 0
AVX_INSTR blendvps, 1, 0
AVX_INSTR cmppd, 1, 0
AVX_INSTR cmpps, 1, 0
AVX_INSTR cmpsd, 1, 0
AVX_INSTR cmpss, 1, 0
AVX_INSTR divpd, 1, 0
AVX_INSTR divps, 1, 0
AVX_INSTR divsd, 1, 0
AVX_INSTR divss, 1, 0
AVX_INSTR haddpd, 1, 0
AVX_INSTR haddps, 1, 0
AVX_INSTR hsubpd, 1, 0
AVX_INSTR hsubps, 1, 0
AVX_INSTR maxpd, 1, 0
AVX_INSTR maxps, 1, 0
AVX_INSTR maxsd, 1, 0
AVX_INSTR maxss, 1, 0
AVX_INSTR minpd, 1, 0
AVX_INSTR minps, 1, 0
AVX_INSTR minsd, 1, 0
AVX_INSTR minss, 1, 0
AVX_INSTR mpsadbw, 0, 1
AVX_INSTR mulpd, 1, 0
AVX_INSTR mulps, 1, 0
AVX_INSTR mulsd, 1, 0
AVX_INSTR mulss, 1, 0
AVX_INSTR packsswb, 0, 0
AVX_INSTR packssdw, 0, 0
AVX_INSTR packuswb, 0, 0
AVX_INSTR packusdw, 0, 0
AVX_INSTR paddb, 0, 0
AVX_INSTR paddw, 0, 0
AVX_INSTR paddd, 0, 0
AVX_INSTR paddq, 0, 0
AVX_INSTR paddsb, 0, 0
AVX_INSTR paddsw, 0, 0
AVX_INSTR paddusb, 0, 0
AVX_INSTR paddusw, 0, 0
AVX_INSTR palignr, 0, 1
AVX_INSTR pandn, 0, 0
AVX_INSTR pavgb, 0, 0
AVX_INSTR pavgw, 0, 0
AVX_INSTR pblendvb, 0, 0
AVX_INSTR pblendw, 0, 1
AVX_INSTR pcmpestri, 0, 0
AVX_INSTR pcmpestrm, 0, 0
AVX_INSTR pcmpistri, 0, 0
AVX_INSTR pcmpistrm, 0, 0
AVX_INSTR pcmpeqb, 0, 0
AVX_INSTR pcmpeqw, 0, 0
AVX_INSTR pcmpeqd, 0, 0
AVX_INSTR pcmpeqq, 0, 0
AVX_INSTR pcmpgtb, 0, 0
AVX_INSTR pcmpgtw, 0, 0
AVX_INSTR pcmpgtd, 0, 0
AVX_INSTR pcmpgtq, 0, 0
AVX_INSTR phaddw, 0, 0
AVX_INSTR phaddd, 0, 0
AVX_INSTR phaddsw, 0, 0
AVX_INSTR phsubw, 0, 0
AVX_INSTR phsubd, 0, 0
AVX_INSTR phsubsw, 0, 0
AVX_INSTR pmaddwd, 0, 0
AVX_INSTR pmaddubsw, 0, 0
AVX_INSTR pmaxsb, 0, 0
AVX_INSTR pmaxsw, 0, 0
AVX_INSTR pmaxsd, 0, 0
AVX_INSTR pmaxub, 0, 0
AVX_INSTR pmaxuw, 0, 0
AVX_INSTR pmaxud, 0, 0
AVX_INSTR pminsb, 0, 0
AVX_INSTR pminsw, 0, 0
AVX_INSTR pminsd, 0, 0
AVX_INSTR pminub, 0, 0
AVX_INSTR pminuw, 0, 0
AVX_INSTR pminud, 0, 0
AVX_INSTR pmulhuw, 0, 0
AVX_INSTR pmulhrsw, 0, 0
AVX_INSTR pmulhw, 0, 0
AVX_INSTR pmullw, 0, 0
AVX_INSTR pmulld, 0, 0
AVX_INSTR pmuludq, 0, 0
AVX_INSTR pmuldq, 0, 0
AVX_INSTR psadbw, 0, 0
AVX_INSTR pshufb, 0, 0
AVX_INSTR psignb, 0, 0
AVX_INSTR psignw, 0, 0
AVX_INSTR psignd, 0, 0
AVX_INSTR psllw, 0, 0
AVX_INSTR pslld, 0, 0
AVX_INSTR psllq, 0, 0
AVX_INSTR pslldq, 0, 0
AVX_INSTR psraw, 0, 0
AVX_INSTR psrad, 0, 0
AVX_INSTR psrlw, 0, 0
AVX_INSTR psrld, 0, 0
AVX_INSTR psrlq, 0, 0
AVX_INSTR psrldq, 0, 0
AVX_INSTR psubb, 0, 0
AVX_INSTR psubw, 0, 0
AVX_INSTR psubd, 0, 0
AVX_INSTR psubq, 0, 0
AVX_INSTR psubsb, 0, 0
AVX_INSTR psubsw, 0, 0
AVX_INSTR psubusb, 0, 0
AVX_INSTR psubusw, 0, 0
AVX_INSTR punpckhbw, 0, 0
AVX_INSTR punpckhwd, 0, 0
AVX_INSTR punpckhdq, 0, 0
AVX_INSTR punpckhqdq, 0, 0
AVX_INSTR punpcklbw, 0, 0
AVX_INSTR punpcklwd, 0, 0
AVX_INSTR punpckldq, 0, 0
AVX_INSTR punpcklqdq, 0, 0
AVX_INSTR shufps, 0, 1
AVX_INSTR subpd, 1, 0
AVX_INSTR subps, 1, 0
AVX_INSTR subsd, 1, 0
AVX_INSTR subss, 1, 0
AVX_INSTR unpckhpd, 1, 0
AVX_INSTR unpckhps, 1, 0
AVX_INSTR unpcklpd, 1, 0
AVX_INSTR unpcklps, 1, 0
AVX_INSTR xorpd, 1, 0
AVX_INSTR xorps, 1, 0

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 1, 0
AVX_INSTR pfsub, 1, 0
AVX_INSTR pfmul, 1, 0