~ubuntu-branches/ubuntu/vivid/gstreamer-vaapi/vivid

« back to all changes in this revision

Viewing changes to ext/libvpx/upstream/vp8/common/x86/sad_ssse3.asm

  • Committer: Package Import Robot
  • Author(s): Vincent Cheng
  • Date: 2014-08-06 23:56:00 UTC
  • mfrom: (0.1.4 sid) (1.1.3)
  • Revision ID: package-import@ubuntu.com-20140806235600-fg1kcmiu67k315q5
Tags: 0.5.9-2
* Remove spurious build-deps: libva-drm1, libavcodec-dev. (Closes: #757283)
* Drop Build-Depends-Indep and build docs unconditionally on all archs.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
;
 
2
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 
3
;
 
4
;  Use of this source code is governed by a BSD-style license
 
5
;  that can be found in the LICENSE file in the root of the source
 
6
;  tree. An additional intellectual property rights grant can be found
 
7
;  in the file PATENTS.  All contributing project authors may
 
8
;  be found in the AUTHORS file in the root of the source tree.
 
9
;
 
10
 
 
11
 
 
12
%include "vpx_ports/x86_abi_support.asm"
 
13
 
 
14
;------------------------------------------------------------------------
; PROCESS_16X2X3 %1
;
; Accumulate SADs for two 16-pixel-wide rows of source against the
; reference block at horizontal byte offsets +0, +1 and +2 (three
; candidate motion-search positions).
;   %1 != 0 : first call  -- psadbw results initialise xmm5/xmm6/xmm7
;   %1 == 0 : later calls -- psadbw results are added into xmm5/xmm6/xmm7
; In:  rsi = src_ptr (16-byte aligned), rax = src_stride
;      rdi = ref_ptr (may be unaligned), rdx = ref_stride
; Out: rsi/rdi advanced by two rows; xmm5/xmm6/xmm7 hold running SADs
;      (psadbw leaves one partial sum per 64-bit lane).
; Uses lddqu for the unaligned reference loads; clobbers xmm0-xmm3.
;------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; first source row
        lddqu           xmm5,       XMMWORD PTR [rdi]       ; ref at +0
        lddqu           xmm6,       XMMWORD PTR [rdi+1]     ; ref at +1
        lddqu           xmm7,       XMMWORD PTR [rdi+2]     ; ref at +2

        psadbw          xmm5,       xmm0                    ; init SAD for +0
        psadbw          xmm6,       xmm0                    ; init SAD for +1
        psadbw          xmm7,       xmm0                    ; init SAD for +2
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; accumulate +0
        paddw           xmm6,       xmm2                    ; accumulate +1
        paddw           xmm7,       xmm3                    ; accumulate +2
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]             ; advance src two rows
        lea             rdi,        [rdi+rdx*2]             ; advance ref two rows

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
 
54
 
 
55
;------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET %1, %2
;
; Same job as PROCESS_16X2X3 (two rows, SADs at +0/+1/+2) but for a
; reference pointer whose original misalignment was %2 bytes.  The
; caller has already done "sub rdi, %2", so both 16-byte reference
; loads here are aligned (movdqa); the three candidate windows are then
; reconstructed with SSSE3 palignr by %2, %2+1 and %2+2 bytes.
;   %1 != 0 : first call  -- initialise accumulators xmm5/xmm6/xmm7
;   %1 == 0 : later calls -- add into xmm5/xmm6/xmm7
; In:  rsi = src_ptr, rax = src_stride, rdi = ref_ptr (re-aligned),
;      rdx = ref_stride.  %2 must be in 0..13 so %2+2 <= 15.
; Out: rsi/rdi advanced by two rows; clobbers xmm0-xmm4.
;------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]       ; first source row
        movdqa          xmm4,       XMMWORD PTR [rdi]       ; aligned ref, low 16B
        movdqa          xmm7,       XMMWORD PTR [rdi+16]    ; aligned ref, high 16B

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2          ; window at +0

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)      ; window at +1

        palignr         xmm7,       xmm4,       (%2+2)      ; window at +2

        psadbw          xmm5,       xmm0                    ; init SAD for +0
        psadbw          xmm6,       xmm0                    ; init SAD for +1
        psadbw          xmm7,       xmm0                    ; init SAD for +2
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                    ; accumulate +0
        paddw           xmm6,       xmm2                    ; accumulate +1
        paddw           xmm7,       xmm3                    ; accumulate +2
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]             ; advance src two rows
        lea             rdi,        [rdi+rdx*2]             ; advance ref two rows

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
 
116
 
 
117
;------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET %1, %2
;
; Emits the "%2_aligned_by_%1" jump-table target for a 16x16 block:
; backs rdi up by %1 bytes so the reference loads inside
; PROCESS_16X2X3_OFFSET become 16-byte aligned, processes eight row
; pairs (16 rows total), then jumps to the shared store code.
;   %1 = ref_ptr misalignment (0..14), %2 = function label prefix.
;------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                      ; back up to 16B boundary

        PROCESS_16X2X3_OFFSET 1, %1                         ; rows 0-1 (init)
        PROCESS_16X2X3_OFFSET 0, %1                         ; rows 2-3
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1                         ; rows 14-15

        jmp             %2_store_off

%endmacro
 
134
 
 
135
;------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET %1, %2
;
; Same as PROCESS_16X16X3_OFFSET but for a 16x8 block: four row pairs
; (8 rows total) before jumping to the shared store code.
;   %1 = ref_ptr misalignment (0..14), %2 = function label prefix.
;------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                      ; back up to 16B boundary

        PROCESS_16X2X3_OFFSET 1, %1                         ; rows 0-1 (init)
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1                         ; rows 6-7

        jmp             %2_store_off

%endmacro
 
148
 
 
149
;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs -- src vs ref_ptr+0, +1 and +2 -- and stores
; them in results[0..2].  The low four bits of ref_ptr select, through a
; position-independent (call/pop based) jump table, a code path
; specialised for that alignment: misalignments 0..14 use aligned loads
; plus palignr; misalignment 15 falls back to unaligned lddqu loads.
global sym(vp8_sad16x16x3_ssse3) PRIVATE
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi                     ; rdx = ref_ptr & 15 (table index)

        jmp .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
        ; 32-bit offsets of each alignment handler, relative to do_jump,
        ; so the table works in position-independent code.
        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:

        call .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute handler address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch on alignment

        ; handlers for misalignment 0..14 (aligned loads + palignr)
        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3

        ; misalignment 15: palignr by 17 is impossible, use lddqu path
.vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial SAD per 64-bit lane;
        ; fold the two halves together and store the low dword.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0] = SAD at +0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1] = SAD at +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2] = SAD at +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
 
261
 
 
262
;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 variant of vp8_sad16x16x3_ssse3: computes three 16x8 SADs -- src
; vs ref_ptr+0, +1 and +2 -- and stores them in results[0..2].  Uses the
; same PIC jump-table dispatch on ref_ptr & 15, with misalignment 15
; handled by the unaligned lddqu path.
global sym(vp8_sad16x8x3_ssse3) PRIVATE
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi                     ; rdx = ref_ptr & 15 (table index)

        jmp .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
        ; 32-bit offsets of each alignment handler, relative to do_jump,
        ; so the table works in position-independent code.
        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:

        call .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute handler address

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch on alignment

        ; handlers for misalignment 0..14 (aligned loads + palignr)
        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3

        ; misalignment 15: palignr by 17 is impossible, use lddqu path
.vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds one partial SAD per 64-bit lane;
        ; fold the two halves together and store the low dword.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0] = SAD at +0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1] = SAD at +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2] = SAD at +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret