;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
14
; PROCESS_16X2X3 <first>
; Loads two rows of a 16-wide source block (rsi) together with three
; overlapping 16-byte reference windows (rdi+0, rdi+1, rdi+2) for a
; 3-candidate SAD search.  rax/rdx carry src/ref strides (set by the
; callers below via arg(1)/arg(3)).
; NOTE(review): this chunk appears damaged by extraction -- the bare
; numeric lines below look like stray source line numbers, and the
; %if %1 / %else split, the psadbw/paddw accumulation steps, and the
; closing %endmacro that this macro needs are not visible.  Code is
; left byte-identical; confirm against upstream libvpx sad_ssse3.asm.
%macro PROCESS_16X2X3 1
16
; -- first-call flavor: candidates presumably land in xmm5/6/7 directly --
movdqa xmm0, XMMWORD PTR [rsi]          ; xmm0 = 16 src bytes, row 0
17
lddqu xmm5, XMMWORD PTR [rdi]           ; ref window at byte offset +0 (unaligned load)
18
lddqu xmm6, XMMWORD PTR [rdi+1]         ; ref window at +1
19
lddqu xmm7, XMMWORD PTR [rdi+2]         ; ref window at +2
25
; -- accumulate flavor: candidates loaded into xmm1/2/3 instead --
movdqa xmm0, XMMWORD PTR [rsi]
26
lddqu xmm1, XMMWORD PTR [rdi]
27
lddqu xmm2, XMMWORD PTR [rdi+1]
28
lddqu xmm3, XMMWORD PTR [rdi+2]
38
; -- second row: same three candidates, one stride down --
movdqa xmm0, XMMWORD PTR [rsi+rax]      ; rax = src_stride (see callers)
39
lddqu xmm1, XMMWORD PTR [rdi+rdx]       ; rdx = ref_stride (see callers)
40
lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
41
lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
55
; PROCESS_16X2X3_OFFSET <first>, <offset>
; SSSE3 variant of PROCESS_16X2X3: instead of three unaligned loads, it
; does two aligned 16-byte loads and synthesizes the +%2, +%2+1 and
; +%2+2 reference windows with palignr.
; %1: flag passed as 1 on the first invocation, 0 afterwards -- its use
;     (init vs. accumulate) is in a part of the macro not visible here.
; %2: byte misalignment of rdi from a 16-byte boundary (0..14 from callers).
; NOTE(review): the bare numeric lines look like stray extraction
; artifacts, and the moves that seed the palignr destinations, the
; psadbw/paddw steps, and the %endmacro are not visible.  palignr needs
; its destination preloaded with the high half ([rdi+16]) for these
; shifts to be meaningful -- confirm against upstream libvpx sad_ssse3.asm.
%macro PROCESS_16X2X3_OFFSET 2
57
movdqa xmm0, XMMWORD PTR [rsi]          ; 16 src bytes, row 0
58
movdqa xmm4, XMMWORD PTR [rdi]          ; aligned ref, low 16 bytes
59
movdqa xmm7, XMMWORD PTR [rdi+16]       ; aligned ref, high 16 bytes
62
palignr xmm5, xmm4, %2                  ; ref window at +%2
65
palignr xmm6, xmm4, (%2+1)              ; ref window at +%2+1
67
palignr xmm7, xmm4, (%2+2)              ; ref window at +%2+2
73
; -- accumulate flavor of the same row: candidates in xmm1/2/3 --
movdqa xmm0, XMMWORD PTR [rsi]
74
movdqa xmm4, XMMWORD PTR [rdi]
75
movdqa xmm3, XMMWORD PTR [rdi+16]
78
palignr xmm1, xmm4, %2
81
palignr xmm2, xmm4, (%2+1)
83
palignr xmm3, xmm4, (%2+2)
93
; -- second row, one stride down (rax = src_stride, rdx = ref_stride) --
movdqa xmm0, XMMWORD PTR [rsi+rax]
94
movdqa xmm4, XMMWORD PTR [rdi+rdx]
95
movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
98
palignr xmm1, xmm4, %2
101
palignr xmm2, xmm4, (%2+1)
103
palignr xmm3, xmm4, (%2+2)
117
; PROCESS_16X16X3_OFFSET <offset>, <label-prefix>
; Emits the 16x16 3-candidate SAD body for one reference alignment:
; 8 invocations of PROCESS_16X2X3_OFFSET, 2 rows each = 16 rows.
; %1: byte misalignment of ref_ptr from a 16-byte boundary
; %2: function label prefix -- presumably used to define the
;     %2_aligned_by_%1 entry label and jump to %2_store_off; those
;     lines are not visible in this chunk (TODO confirm upstream).
; NOTE(review): no %endmacro visible; bare numeric lines look like
; stray extraction artifacts.  Code left byte-identical.
%macro PROCESS_16X16X3_OFFSET 2
122
PROCESS_16X2X3_OFFSET 1, %1 ; rows 0-1: first call (flag=1, init path)
123
PROCESS_16X2X3_OFFSET 0, %1 ; rows 2-3: accumulate
124
PROCESS_16X2X3_OFFSET 0, %1 ; rows 4-5
125
PROCESS_16X2X3_OFFSET 0, %1 ; rows 6-7
126
PROCESS_16X2X3_OFFSET 0, %1 ; rows 8-9
127
PROCESS_16X2X3_OFFSET 0, %1 ; rows 10-11
128
PROCESS_16X2X3_OFFSET 0, %1 ; rows 12-13
129
PROCESS_16X2X3_OFFSET 0, %1 ; rows 14-15
135
; PROCESS_16X8X3_OFFSET <offset>, <label-prefix>
; 16x8 variant: 4 invocations of PROCESS_16X2X3_OFFSET = 8 rows.
; %1: byte misalignment of ref_ptr from a 16-byte boundary
; %2: function label prefix -- presumably used for the aligned_by_%1
;     entry label and a jump to %2_store_off in lines not visible here.
; NOTE(review): no %endmacro visible; bare numeric lines look like
; stray extraction artifacts.  Code left byte-identical.
%macro PROCESS_16X8X3_OFFSET 2
140
PROCESS_16X2X3_OFFSET 1, %1 ; rows 0-1: first call (flag=1, init path)
141
PROCESS_16X2X3_OFFSET 0, %1 ; rows 2-3: accumulate
142
PROCESS_16X2X3_OFFSET 0, %1 ; rows 4-5
143
PROCESS_16X2X3_OFFSET 0, %1 ; rows 6-7
149
;-----------------------------------------------------------------------
; void vp8_sad16x16x3_ssse3(unsigned char *src_ptr, int src_stride,
;                           unsigned char *ref_ptr, int ref_stride,
;                           int *results)
; Computes three 16x16 sums of absolute differences between src and the
; reference at byte offsets +0, +1, +2, dispatching through a
; position-independent jump table indexed by the reference pointer's
; 16-byte misalignment (aligned_by_0 .. aligned_by_15 entry points).
; NOTE(review): this chunk looks damaged by extraction -- the bare
; numeric lines appear to be stray source line numbers, and several
; spans the routine needs are not visible: register saves after
; SHADOW_ARGS_TO_STACK, the "ref_ptr & 15" computation that must feed
; rdx before the table lookup, the add/indirect-jmp that consumes the
; table offset, the psadbw reduction and stores after store_off, and
; the epilogue/ret.  Code left byte-identical; confirm against
; upstream libvpx vp8/common/x86/sad_ssse3.asm before assembling.
;-----------------------------------------------------------------------
;void int vp8_sad16x16x3_ssse3(
150
; unsigned char *src_ptr,
152
; unsigned char *ref_ptr,
155
global sym(vp8_sad16x16x3_ssse3) PRIVATE
156
sym(vp8_sad16x16x3_ssse3):
159
SHADOW_ARGS_TO_STACK 5 ; libvpx x86_abi_support helper: expose 5 args as arg(n)
166
mov rsi, arg(0) ;src_ptr
167
mov rdi, arg(2) ;ref_ptr
172
jmp .vp8_sad16x16x3_ssse3_skiptable ; table below is data -- never fall into it
173
.vp8_sad16x16x3_ssse3_jumptable:
174
; 16 dword entries, one per possible (ref_ptr & 15); each stores the
; entry point's offset relative to do_jump so the table stays
; position independent.
dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
175
dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
176
dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
177
dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
178
dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
179
dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
180
dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
181
dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
182
dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
183
dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
184
dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
185
dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
186
dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
187
dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
188
dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
189
dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
190
.vp8_sad16x16x3_ssse3_skiptable:
192
call .vp8_sad16x16x3_ssse3_do_jump ; classic get-PC trick: pushes next address
193
.vp8_sad16x16x3_ssse3_do_jump:
194
pop rcx ; get the address of do_jump
195
mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
196
add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
198
; rdx must hold (ref_ptr & 15) here; the instruction computing it is
; not visible in this chunk -- TODO confirm upstream.
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
201
; NOTE(review): rax (the table offset) is overwritten immediately below
; without being consumed -- the add-to-base / indirect jmp appears lost.
movsxd rax, dword ptr arg(1) ;src_stride
202
movsxd rdx, dword ptr arg(3) ;ref_stride
206
; One macro expansion per ref alignment 0..14; alignment 15 falls
; through to the hand-written label below (its body is not visible).
PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
207
PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
208
PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
209
PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
210
PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
211
PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
212
PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
213
PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
214
PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
215
PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
216
PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
217
PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
218
PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
219
PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
220
PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
222
.vp8_sad16x16x3_ssse3_aligned_by_15:
232
.vp8_sad16x16x3_ssse3_store_off:
233
mov rdi, arg(4) ;Results -- int results[3]; reduction/stores not visible here
262
;-----------------------------------------------------------------------
; void vp8_sad16x8x3_ssse3(unsigned char *src_ptr, int src_stride,
;                          unsigned char *ref_ptr, int ref_stride,
;                          int *results)
; 16x8 counterpart of vp8_sad16x16x3_ssse3: three 16x8 SADs for ref
; offsets +0, +1, +2, dispatched via the same position-independent
; jump-table-on-misalignment scheme.
; NOTE(review): same extraction damage as the 16x16 routine above --
; bare numeric lines are stray artifacts; the alignment computation
; feeding rdx, the indirect jump consuming the table offset, the
; store_off reduction/stores, and the epilogue/ret run past or are
; missing from this view.  Code left byte-identical; confirm against
; upstream libvpx vp8/common/x86/sad_ssse3.asm.
;-----------------------------------------------------------------------
;void int vp8_sad16x8x3_ssse3(
263
; unsigned char *src_ptr,
265
; unsigned char *ref_ptr,
268
global sym(vp8_sad16x8x3_ssse3) PRIVATE
269
sym(vp8_sad16x8x3_ssse3):
272
SHADOW_ARGS_TO_STACK 5 ; libvpx x86_abi_support helper: expose 5 args as arg(n)
279
mov rsi, arg(0) ;src_ptr
280
mov rdi, arg(2) ;ref_ptr
285
jmp .vp8_sad16x8x3_ssse3_skiptable ; table below is data -- never fall into it
286
.vp8_sad16x8x3_ssse3_jumptable:
287
; 16 dword entries relative to do_jump, indexed by (ref_ptr & 15).
dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
288
dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
289
dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
290
dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
291
dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
292
dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
293
dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
294
dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
295
dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
296
dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
297
dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
298
dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
299
dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
300
dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
301
dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
302
dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
303
.vp8_sad16x8x3_ssse3_skiptable:
305
call .vp8_sad16x8x3_ssse3_do_jump ; get-PC trick: pushes next address
306
.vp8_sad16x8x3_ssse3_do_jump:
307
pop rcx ; get the address of do_jump
308
mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
309
add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
311
; rdx must hold (ref_ptr & 15) here; the instruction computing it is
; not visible in this chunk -- TODO confirm upstream.
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
314
; NOTE(review): rax (the table offset) is overwritten immediately below
; without being consumed -- the add-to-base / indirect jmp appears lost.
movsxd rax, dword ptr arg(1) ;src_stride
315
movsxd rdx, dword ptr arg(3) ;ref_stride
319
; One macro expansion per ref alignment 0..14; alignment 15 falls
; through to the hand-written label below (its body is not visible).
PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
320
PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
321
PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
322
PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
323
PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
324
PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
325
PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
326
PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
327
PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
328
PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
329
PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
330
PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
331
PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
332
PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
333
PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
335
.vp8_sad16x8x3_ssse3_aligned_by_15:
342
.vp8_sad16x8x3_ssse3_store_off:
343
mov rdi, arg(4) ;Results -- int results[3]; reduction/stores not visible here