1
;*****************************************************************************
2
;* mc-a2.asm: h264 encoder library
3
;*****************************************************************************
4
;* Copyright (C) 2005 x264 project
6
;* This program is free software; you can redistribute it and/or modify
7
;* it under the terms of the GNU General Public License as published by
8
;* the Free Software Foundation; either version 2 of the License, or
9
;* (at your option) any later version.
11
;* This program is distributed in the hope that it will be useful,
12
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
;* GNU General Public License for more details.
16
;* You should have received a copy of the GNU General Public License
17
;* along with this program; if not, write to the Free Software
18
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
19
;*****************************************************************************
23
;=============================================================================
24
; Macros and other preprocessor constants
25
;=============================================================================
27
%include "amd64inc.asm"
29
;=============================================================================
31
;=============================================================================
47
;=============================================================================
49
;=============================================================================
86
LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
88
movd mm5, [%1 + 4 * rcx]
102
;=============================================================================
104
;=============================================================================
108
cglobal x264_horizontal_filter_mmxext
109
cglobal x264_center_filter_mmxext
111
;-----------------------------------------------------------------------------
113
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
114
; uint8_t *dst2, int i_dst2_stride,
115
; uint8_t *src, int i_src_stride,
116
; int i_width, int i_height );
118
;-----------------------------------------------------------------------------
121
x264_center_filter_mmxext :
; Entry point for the routine whose C prototype is given in the comment block
; above (dst1/dst2/src pointers with their strides, plus width and height).
;
; NOTE(review): this listing is fragmentary. The bare numbers between
; instructions look like line-number residue from an extraction step, and
; several instructions, labels, branches and any preprocessor conditionals are
; not visible. The comments below describe only the visible lines; anything
; about the missing context is marked as an assumption.
;
; Register roles as far as the visible lines establish them:
;   r12 = tsrc (source pointer)        r13 = src_stride
;   r8  = dst1                         r9  = dst1_stride
;   r10 = dst2                         r11 = dst2_stride
;   r14 = width                        r15 = height
;   rcx = src_stride, rbx = 3 * src_stride, rdx = 5 * src_stride
;   mm0 = 0, mm7 = [mmx_dd_one]
147
; Two alternative ways of fetching src/src_stride follow back to back: from
; the stack (next two instructions) and from r9d (the one after).
; NOTE(review): presumably these sit in opposite arms of an ABI conditional
; that is not shown here -- confirm against the full file.
movsxd r13, dword [rsp+64+48] ; src_stride
148
mov r12, [rsp+64+40] ; src
150
movsxd r13, r9d ; src_stride
154
; Only one subtraction is visible here; the existing comment states the
; cumulative result. NOTE(review): presumably a second `sub` is on a line
; that is not shown -- verify.
sub r12, r13 ; tsrc = src - 2 * src_stride
156
; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
157
; rax = 2 * src_stride + 24 + tbuffer. `tbuffer` is a symbol defined outside
; this view; how rax is consumed afterwards is not visible.
lea rax, [r13 + r13 + 24 + tbuffer]
160
; Copy the destination arguments out of the parmN register aliases into
; r8..r11. The aliases themselves are defined outside this view.
mov r10, parm3q ; dst2
161
movsxd r11, parm4d ; dst2_stride
162
mov r8, parm1q ; dst1
163
movsxd r9, parm2d ; dst1_stride
165
; width/height are read from the stack relative to rbp and sign-extended to
; 64 bits. Two offset variants are visible (with and without an extra 64).
; NOTE(review): presumably alternative arms of the same hidden conditional
; as above -- confirm.
movsxd r14, dword [rbp + 64 + 56] ; width
166
movsxd r15, dword [rbp + 64 + 64] ; height
168
movsxd r14, dword [rbp + 56] ; width
169
movsxd r15, dword [rbp + 64] ; height
172
; Precompute stride multiples used as address offsets.
mov rcx, r13 ; src_stride
173
lea rbx, [r13 + r13 * 2] ; 3 * src_stride
174
lea rdx, [r13 + r13 * 4] ; 5 * src_stride
176
pxor mm0, mm0 ; 0 ---> mm0
177
movq mm7, [mmx_dd_one GLOBAL] ; for rounding
187
; Start of a row: store mm2 and mm1 to the first 16 bytes of the stack
; buffer at rsp + tbuffer. The instructions that produce mm1/mm2 are not
; visible in this listing.
movq [rsp + tbuffer], mm2
188
movq [rsp + tbuffer + 8], mm1
189
; Packed 16-bit add of the mmx_dw_one constant to mm1.
paddw mm1, [mmx_dw_one GLOBAL]
193
; movd stores the low 32 bits of mm1.
movd [r8], mm1 ; dst1[0] = mm1
197
lea rdi, [r8 - 4] ; rdi = dst1 - 4
203
; Row body: the 16-bit intermediates go to tbuffer at byte offset 2 * rax,
; and 4 bytes go to dst1 at rdi + rax (rdi = dst1 - 4, per the line above).
; The loop label and branch are not visible.
movq [rsp + tbuffer + 2 * rax], mm1
204
paddw mm1, [mmx_dw_one GLOBAL]
207
movd [rdi + rax], mm1 ; dst1[rax - 4] = mm1
211
cmp rax, r14 ; cmp rax, width
217
; Same store pattern again, this time also writing mm2 at +8.
; NOTE(review): looks like the end-of-row tail -- confirm.
movq [rsp + tbuffer + 2 * rax], mm1
218
movq [rsp + tbuffer + 2 * rax + 8], mm2
219
paddw mm1, [mmx_dw_one GLOBAL]
222
movd [rdi + rax], mm1 ; dst1[rax - 4] = mm1
224
; Advance source and dst1 to the next row.
add r12, r13 ; tsrc = tsrc + src_stride
226
add r8, r9 ; dst1 = dst1 + dst1_stride
232
; Second pass over the buffered row: six overlapping 8-byte loads from the
; stack buffer at byte offsets 4, 6, 8, 10, 12 and 14 from
; (rsp + 2 * rax + tbuffer), i.e. each window starts one 16-bit element after
; the previous one. mm1 takes offset 4 and mm2..mm6 take offsets 6..14.
movq mm2, [rsp + 2 * rax + 2 + 4 + tbuffer]
233
movq mm3, [rsp + 2 * rax + 4 + 4 + tbuffer]
234
movq mm4, [rsp + 2 * rax + 6 + 4 + tbuffer]
235
movq mm5, [rsp + 2 * rax + 8 + 4 + tbuffer]
236
movq mm1, [rsp + 2 * rax + 4 + tbuffer]
237
movq mm6, [rsp + 2 * rax + 10 + 4 + tbuffer]
242
; Load the mmx_dw_20 and mmx_dw_5 constants. The arithmetic that consumes
; them is not visible in this listing.
movq mm5, [mmx_dw_20 GLOBAL]
243
movq mm4, [mmx_dw_5 GLOBAL]
249
; Interleave 16-bit words of mm2 (low half) / mm3 (high half) with the
; constant words from memory.
; NOTE(review): presumably feeds a pmaddwd on a line not shown -- confirm.
punpcklwd mm2, [mmx_dw_20 GLOBAL]
250
punpckhwd mm3, [mmx_dw_5 GLOBAL]
263
; Packed 32-bit add of the mmx_dd_one constant to both accumulators.
paddd mm2, [mmx_dd_one GLOBAL]
264
paddd mm3, [mmx_dd_one GLOBAL]
272
; movd stores the low 32 bits of mm2 to dst2 at byte offset rax.
movd [r10 + rax], mm2 ; dst2[rax] = mm2
275
cmp rax, r14 ; cmp rax, width
278
add r10, r11 ; dst2 += dst2_stride
299
;-----------------------------------------------------------------------------
301
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
302
; uint8_t *src, int i_src_stride,
303
; int i_width, int i_height );
305
;-----------------------------------------------------------------------------
308
x264_horizontal_filter_mmxext :
; Entry point for the routine whose C prototype is given in the comment block
; above (dst, dst stride, src, src stride, width, height).
;
; NOTE(review): this listing is fragmentary. The bare numbers between
; instructions look like line-number residue from an extraction step. The
; setup of rdx, r9, rax and mm0, the loop labels and branches, the bodies of
; the LOAD_4 / FILT_6 macros and the return are all not visible here. The
; comments below cover only the visible lines.
;
; Register roles as far as the visible lines establish them:
;   r10 = dst_stride    r11 = src_stride
;   rcx = height        r8  = width
;   mm7 = [mmx_dw_one]
;   rdx + rax is used as the source address and is advanced per row by r11;
;   r9 is advanced per row by r10.
309
movsxd r10, parm2d ; dst_stride
310
movsxd r11, parm4d ; src_stride
314
; The height load appears twice in a row.
; NOTE(review): presumably one copy per arm of a conditional that is not
; shown in this listing -- confirm against the full file.
movsxd rcx, parm6d ; height
316
movsxd rcx, parm6d ; height
320
movsxd r8, parm5d ; width
323
movq mm7, [mmx_dw_one GLOBAL]
334
; Non-temporal prefetch hint for the source data 48 bytes ahead of the
; current position.
prefetchnta [rdx + rax + 48]
336
; LOAD_4 is a macro defined outside this view. It is invoked with four
; destination registers, four source addresses one byte apart
; (offsets 0..3), and mm0 as the last argument.
LOAD_4 mm1, mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
338
; movd loads 32 bits from each of the following byte offsets.
movd mm5, [rdx + rax + 4]
339
movd mm6, [rdx + rax + 5]
341
movd mm2, [rdx + rax + 4]
342
movd mm3, [rdx + rax + 6]
345
; FILT_6 is a macro defined outside this view; its effect on the operands
; cannot be determined from this listing.
FILT_6 mm1, mm5, mm6, mm7
346
movd mm4, [rdx + rax + 7]
347
movd mm5, [rdx + rax + 8]
349
; punpcklbw against mm0 interleaves the low four bytes with mm0's bytes.
; NOTE(review): this is a byte -> word zero-extension if mm0 is zero; mm0 is
; not initialised in the visible lines -- confirm.
punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready
351
movd mm6, [rdx + rax + 9]
353
punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
356
FILT_6 mm2, mm5, mm6, mm7
362
; Column counter compared against width; the conditional branch is not
; visible.
cmp rax, r8 ; cmp rax, width
365
; Advance rdx by src_stride (r11) and r9 by dst_stride (r10) for the next
; row.
; NOTE(review): presumably rdx = src and r9 = dst, but their initialisation
; is not shown -- verify.
add rdx, r11 ; src_pitch
366
add r9, r10 ; dst_pitch