2
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4
; Use of this source code is governed by a BSD-style license
5
; that can be found in the LICENSE file in the root of the source
6
; tree. An additional intellectual property rights grant can be found
7
; in the file PATENTS. All contributing project authors may
8
; be found in the AUTHORS file in the root of the source tree.
12
; ABI helper include. The macros used below (sym, arg, GLOBAL,
; SHADOW_ARGS_TO_STACK, PRIVATE, HIDDEN_DATA) are presumably defined there --
; they are not defined anywhere in the visible part of this file.
%include "vpx_ports/x86_abi_support.asm"
13
; Bilinear filter table, defined in another translation unit. The bilinear
; predictors below index it in 32-byte steps (offset << 5) where that shift
; is visible.
extern sym(vp8_bilinear_filters_x86_8)
16
; Not referenced by any code visible in this listing.
%define BLOCK_HEIGHT_WIDTH 4
17
; Not referenced by any code visible in this listing; equals
; 1 << VP8_FILTER_SHIFT.
%define vp8_filter_weight 128
18
; Right-shift applied (psraw) after adding the rounding constant rd.
%define VP8_FILTER_SHIFT 7
21
;void vp8_filter_block1d_h6_mmx
23
; unsigned char *src_ptr,
24
; unsigned short *output_ptr,
25
; unsigned int src_pixels_per_line,
26
; unsigned int pixel_step,
27
; unsigned int output_height,
28
; unsigned int output_width,
;
; Horizontal 6-tap filter pass. For a row, 8 source bytes starting at
; src_ptr - 2 are loaded; shifted copies are widened to 16-bit words,
; multiplied by per-tap vectors from the filter table and accumulated in mm3
; with signed saturation, then rounded (rd) and shifted right by
; VP8_FILTER_SHIFT.
;
; Argument slots as read by the code below:
;   arg(0) src_ptr              arg(1) output_ptr
;   arg(2) src_pixels_per_line  arg(4) output_height
;   arg(5) output_width         arg(6) vp8_filter (not listed in the
;                                      prototype comment above)
;   arg(3) (pixel_step in the prototype) is never read here.
; The filter table is read as six 8-byte vectors at 16-byte strides
; (offsets 0, 16, 32, 48, 64, 80 from vp8_filter).
;
; NOTE(review): this listing appears to be missing lines. Not visible here:
; the .nextrow label targeted by the final jnz, any register save/restore,
; the return, a load of p3..p6 before the offset-80 tap, the advance of rdi,
; and whatever selects between the two source-advance sequences near the
; end. Confirm against a buildable copy of the file.
31
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
32
sym(vp8_filter_block1d_h6_mmx):
35
SHADOW_ARGS_TO_STACK 7
41
mov rdx, arg(6) ;vp8_filter
43
; Preload the four inner tap vectors; the outer two (offsets 0 and 80) are
; used straight from memory further down.
movq mm1, [rdx + 16] ; do both the negative taps first!!!
44
movq mm2, [rdx + 32] ; mm2 = tap vector at offset 32
45
movq mm6, [rdx + 48] ; mm6 = tap vector at offset 48
46
movq mm7, [rdx + 64] ; mm7 = tap vector at offset 64
48
mov rdi, arg(1) ;output_ptr
49
mov rsi, arg(0) ;src_ptr
50
movsxd rcx, dword ptr arg(4) ;output_height
51
; rax is loaded here but not read by any instruction visible in this listing.
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
52
pxor mm0, mm0 ; mm0 = 00000000
55
; NOTE(review): the jnz at the bottom targets .nextrow; the label itself is
; not visible. Presumably it sits here, at the start of the per-row work.
movq mm3, [rsi-2] ; mm3 = p-2..p5
56
movq mm4, mm3 ; mm4 = p-2..p5
57
psrlq mm3, 8 ; mm3 = p-1..p5
58
punpcklbw mm3, mm0 ; mm3 = p-1..p2
59
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
61
movq mm5, mm4 ; mm5 = p-2..p5
62
punpckhbw mm4, mm0 ; mm4 = p2..p5 (high four bytes widened to words)
63
pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers
64
paddsw mm3, mm4 ; mm3 += mm4
66
movq mm4, mm5 ; mm4 = p-2..p5;
67
psrlq mm5, 16 ; mm5 = p0..p5;
68
punpcklbw mm5, mm0 ; mm5 = p0..p3
69
pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
70
paddsw mm3, mm5 ; mm3 += mm5
72
movq mm5, mm4 ; mm5 = p-2..p5
73
psrlq mm4, 24 ; mm4 = p1..p5
74
punpcklbw mm4, mm0 ; mm4 = p1..p4
75
pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers
76
paddsw mm3, mm4 ; mm3 += mm4
78
; do outer positive taps
80
; NOTE(review): as listed, mm4 still holds the tap-3 products at this point;
; no instruction loading pixels p3..p6 into mm4 is visible before this
; unpack. A load appears to be missing -- confirm against the original.
punpcklbw mm4, mm0 ; intended: mm4 = p3..p6
81
; The existing comments numbered the two outer taps the other way round:
; offset 80 is the last of the six vectors, offset 0 the first.
pmullw mm4, [rdx+80] ; mm4 *= tap vector at offset 80
82
paddsw mm3, mm4 ; mm3 += mm4
84
punpcklbw mm5, mm0 ; mm5 = p-2..p1
85
pmullw mm5, [rdx] ; mm5 *= tap vector at offset 0
86
paddsw mm3, mm5 ; mm3 += mm5
88
paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
89
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
90
; packuswb clamps each word to 0..255. Only the pack is visible; the
; "unpack" mentioned in the comment (which would give 16-bit outputs to
; match the unsigned short *output_ptr prototype) is not in this listing.
packuswb mm3, mm0 ; pack and unpack to saturate
93
movq [rdi], mm3 ; store the results in the destination
96
; NOTE(review): two different "advance source to next line" sequences follow
; back to back; executed as listed, rsi would advance by two lines per row.
; These look like alternative variants whose selecting directives are not
; visible -- TODO confirm.
add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
99
movsxd r8, dword ptr arg(2) ;src_pixels_per_line
102
add rsi, r8 ; next line
105
dec rcx ; decrement count
106
jnz .nextrow ; next row
117
;void vp8_filter_block1dc_v6_mmx
120
; unsigned char *output_ptr,
122
; unsigned int pixels_per_line,
123
; unsigned int pixel_step,
124
; unsigned int output_height,
125
; unsigned int output_width,
;
; Vertical 6-tap filter pass. For each output row, six source rows spaced
; pixels_per_line apart are each multiplied (as four 16-bit words) by one
; tap vector and accumulated in mm3 with signed saturation; the sum is
; rounded, shifted right by VP8_FILTER_SHIFT, packed to unsigned bytes and
; the low 4 bytes are stored at output_ptr.
;
; Argument slots as read by the code below (the prototype comment above is
; incomplete in this listing):
;   arg(0) src_ptr       arg(1) output_ptr   arg(2) output_pitch
;   arg(3) pixels_per_line                   arg(5) output_height
;   arg(7) vp8_filter
;   arg(4) and arg(6) are never read here.
; The source rows are multiplied without a byte-to-word unpack, so src_ptr
; presumably points at 16-bit intermediate data -- TODO confirm.
;
; NOTE(review): this listing appears to be missing lines. Not visible here:
; the .nextrow_cv label targeted by the final jnz, the advance of rdi by the
; destination pitch, any save/restore of rbx (which is overwritten below),
; and the return. Confirm against a buildable copy of the file.
128
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
129
sym(vp8_filter_block1dc_v6_mmx):
132
SHADOW_ARGS_TO_STACK 8
138
movq mm5, [GLOBAL(rd)]
140
; NOTE(review): rbx is written here; no push/pop of rbx is visible.
mov rbx, arg(7) ;vp8_filter
141
movq mm1, [rbx + 16] ; do both the negative taps first!!!
142
movq mm2, [rbx + 32] ; mm2 = tap vector at offset 32
143
movq mm6, [rbx + 48] ; mm6 = tap vector at offset 48
144
movq mm7, [rbx + 64] ; mm7 = tap vector at offset 64
146
movsxd rdx, dword ptr arg(3) ;pixels_per_line
147
mov rdi, arg(1) ;output_ptr
148
mov rsi, arg(0) ;src_ptr
151
movsxd rcx, DWORD PTR arg(5) ;output_height
152
; rax is loaded here but not read by any instruction visible in this listing.
movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
153
pxor mm0, mm0 ; mm0 = 00000000
157
; NOTE(review): the jnz at the bottom targets .nextrow_cv; the label itself
; is not visible. Presumably it sits here, at the start of the per-row work.
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
158
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
161
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
162
pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
163
paddsw mm3, mm4 ; mm3 += mm4
165
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
166
pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
167
paddsw mm3, mm4 ; mm3 += mm4
169
movq mm4, [rsi] ; mm4 = p0..p3 = row -2
170
pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
171
paddsw mm3, mm4 ; mm3 += mm4
174
; After this add, [rsi + 2*rdx] and [rsi + 4*rdx] address the rows at
; 3*rdx and 5*rdx from the row start, so no *3 or *5 scale is needed.
; The add also leaves rsi one line further on for the next output row.
add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
175
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
176
pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
177
paddsw mm3, mm4 ; mm3 += mm4
179
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
180
pmullw mm4, [rbx +80] ; mm4 *= tap vector at offset 80 (the sixth tap)
181
paddsw mm3, mm4 ; mm3 += mm4
184
paddsw mm3, mm5 ; mm3 += round value
185
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
186
packuswb mm3, mm0 ; pack and saturate
188
movd [rdi],mm3 ; store the results in the destination
189
; the subsequent iterations repeat 3 out of 4 of these reads. Since the
190
; recon block should be in cache this shouldn't cost much. Its obviously
193
dec rcx ; decrement count
194
jnz .nextrow_cv ; next row
207
;void bilinear_predict8x8_mmx
209
; unsigned char *src_ptr,
210
; int src_pixels_per_line,
213
; unsigned char *dst_ptr,
;
; Bilinear prediction writing 8-byte-wide rows to dst_ptr. Filter entries
; are selected from vp8_bilinear_filters_x86_8 at 32-byte steps by xoffset
; and yoffset.
;
; Argument slots as read by the code below (the prototype comment above is
; incomplete in this listing):
;   arg(0) src_ptr   arg(1) src_pixels_per_line   arg(2) xoffset
;   arg(3) yoffset   arg(4) dst_ptr               arg(5) dst_pitch
;
; NOTE(review): this listing appears to be missing many lines. Not visible
; here: zeroing of mm0 before it is used as the unpack source, any use of
; the horizontal filter address before rax is overwritten, the horizontal
; multiplies/adds, the high-half unpack of mm4, the pack before the store,
; the loop label/branch that would use rcx, register save/restore, and the
; return. Confirm against a buildable copy of the file.
216
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
217
sym(vp8_bilinear_predict8x8_mmx):
220
SHADOW_ARGS_TO_STACK 6
226
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
227
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
229
movsxd rax, dword ptr arg(2) ;xoffset
230
mov rdi, arg(4) ;dst_ptr ;
232
shl rax, 5 ; offset * 32
233
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
235
add rax, rcx ; HFilter
236
mov rsi, arg(0) ;src_ptr ;
238
movsxd rdx, dword ptr arg(5) ;dst_pitch
242
; NOTE(review): rax (HFilter) is overwritten here with no visible read of
; the horizontal taps in between.
movsxd rax, dword ptr arg(3) ;yoffset
246
shl rax, 5 ; offset*32
247
add rax, rcx ; VFilter
249
; rcx = dst_ptr + 8 * dst_pitch, i.e. the address just past 8 output rows.
; No instruction visible in this listing reads rcx afterwards.
lea rcx, [rdi+rdx*8] ;
250
; rdx is reused: from here on it holds the source stride.
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
254
; get the first horizontal line done ;
255
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
256
movq mm4, mm3 ; make a copy of current line
258
; mm0 is assumed to be zero here; its initialisation is not visible.
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
276
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
277
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
279
paddw mm4, [GLOBAL(rd)] ; mm4 += round value
280
psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7 (arithmetic), i.e. /128
285
add rsi, rdx ; next line
287
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
288
movq mm4, mm3 ; make a copy of current line
290
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
317
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
318
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
320
paddw mm4, [GLOBAL(rd)] ; mm4 += round value
321
psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7 (arithmetic), i.e. /128
327
; Vertical stage: multiply by the vector at VFilter + 16. The matching
; multiply by the vector at VFilter + 0 is not visible in this listing.
pmullw mm3, [rax+16] ;
328
pmullw mm4, [rax+16] ;
334
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
335
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
337
paddw mm4, [GLOBAL(rd)] ; mm4 += round value
338
psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7 (arithmetic), i.e. /128
342
movq [rdi], mm3 ; store the results in the destination
345
; NOTE(review): two different "advance to next line" sequences follow back
; to back (one adds dst_pitch from memory, one via r8); executed as listed,
; both rsi and rdi would advance twice. These look like alternative variants
; whose selecting directives are not visible -- TODO confirm.
add rsi, rdx ; next line
346
add rdi, dword ptr arg(5) ;dst_pitch ;
348
movsxd r8, dword ptr arg(5) ;dst_pitch
349
add rsi, rdx ; next line
350
add rdi, r8 ;dst_pitch
364
;void bilinear_predict8x4_mmx
366
; unsigned char *src_ptr,
367
; int src_pixels_per_line,
370
; unsigned char *dst_ptr,
;
; Bilinear prediction writing 8-byte-wide rows to dst_ptr; the end-of-output
; address is computed for 4 rows (dst_ptr + 4 * dst_pitch).
;
; Argument slots as read by the code below (the prototype comment above is
; incomplete in this listing):
;   arg(0) src_ptr   arg(1) src_pixels_per_line   arg(2) xoffset
;   arg(3) yoffset   arg(4) dst_ptr               arg(5) dst_pitch
;
; NOTE(review): this listing appears to be missing many lines. Not visible
; here: the shift/add that turn xoffset and yoffset into filter-table
; addresses, zeroing of mm0, the horizontal multiplies/adds, the high-half
; unpack of mm4, the pack before the store, the final "add rdi, r8", the
; loop label/branch, register save/restore, and the return. Confirm against
; a buildable copy of the file.
373
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
374
sym(vp8_bilinear_predict8x4_mmx):
377
SHADOW_ARGS_TO_STACK 6
383
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
384
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
386
movsxd rax, dword ptr arg(2) ;xoffset
387
mov rdi, arg(4) ;dst_ptr ;
389
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
392
mov rsi, arg(0) ;src_ptr ;
395
movsxd rdx, dword ptr arg(5) ;dst_pitch
399
; NOTE(review): rax is overwritten here with no visible use of xoffset, and
; as listed rax is never scaled or added to the table base before the
; [rax+16] reads further down.
movsxd rax, dword ptr arg(3) ;yoffset
405
; rcx = dst_ptr + 4 * dst_pitch, i.e. the address just past 4 output rows.
; No instruction visible in this listing reads rcx afterwards.
lea rcx, [rdi+rdx*4] ;
407
; rdx is reused: from here on it holds the source stride.
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
409
; get the first horizontal line done ;
410
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
411
movq mm4, mm3 ; make a copy of current line
413
; mm0 is assumed to be zero here; its initialisation is not visible.
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
431
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
432
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
434
paddw mm4, [GLOBAL(rd)] ; mm4 += round value
435
psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7 (arithmetic), i.e. /128
440
add rsi, rdx ; next line
442
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
443
movq mm4, mm3 ; make a copy of current line
445
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
472
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
473
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
475
paddw mm4, [GLOBAL(rd)] ; mm4 += round value
476
psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7 (arithmetic), i.e. /128
482
; Vertical stage: multiply by the vector at rax + 16. The matching multiply
; by the vector at rax + 0 is not visible in this listing.
pmullw mm3, [rax+16] ;
483
pmullw mm4, [rax+16] ;
489
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
490
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
492
paddw mm4, [GLOBAL(rd)] ; mm4 += round value
493
psraw mm4, VP8_FILTER_SHIFT ; mm4 >>= 7 (arithmetic), i.e. /128
497
movq [rdi], mm3 ; store the results in the destination
500
; NOTE(review): two "advance to next line" sequences follow back to back;
; as listed rsi advances twice and r8 is loaded but never added to rdi.
; These look like alternative variants whose selecting directives (and the
; trailing "add rdi, r8") are not visible -- TODO confirm.
add rsi, rdx ; next line
501
add rdi, dword ptr arg(5) ;dst_pitch ;
503
movsxd r8, dword ptr arg(5) ;dst_pitch
504
add rsi, rdx ; next line
519
;void bilinear_predict4x4_mmx
521
; unsigned char *src_ptr,
522
; int src_pixels_per_line,
525
; unsigned char *dst_ptr,
;
; Bilinear prediction on 4-byte-wide rows: each source row is read with a
; 4-byte movd and each result is written with a 4-byte movd. The
; end-of-output address is computed for 4 rows (dst_ptr + 4 * dst_pitch).
;
; Argument slots as read by the code below (the prototype comment above is
; incomplete in this listing):
;   arg(0) src_ptr   arg(1) src_pixels_per_line   arg(2) xoffset
;   arg(3) yoffset   arg(4) dst_ptr               arg(5) dst_pitch
;
; NOTE(review): this listing appears to be missing many lines. Not visible
; here: the shift that scales xoffset/yoffset to a table offset, the add
; that forms the VFilter address, zeroing of mm0, the horizontal
; multiplies/adds, the pack before the store, the final "add rdi, r8", the
; loop label/branch, register save/restore, and the return. Confirm against
; a buildable copy of the file.
528
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
529
sym(vp8_bilinear_predict4x4_mmx):
532
SHADOW_ARGS_TO_STACK 6
538
;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
539
;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
541
movsxd rax, dword ptr arg(2) ;xoffset
542
mov rdi, arg(4) ;dst_ptr ;
544
lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
547
; NOTE(review): as listed, xoffset is added to the table base unscaled; the
; other predictors that show the scaling use "shl rax, 5" first.
add rax, rcx ; HFilter
548
mov rsi, arg(0) ;src_ptr ;
550
movsxd rdx, dword ptr arg(5) ;dst_pitch
554
; NOTE(review): rax (HFilter) is overwritten here with no visible read of
; the horizontal taps in between, and as listed rax is never turned into a
; table address before the [rax+16] read further down.
movsxd rax, dword ptr arg(3) ;yoffset
560
; rcx = dst_ptr + 4 * dst_pitch, i.e. the address just past 4 output rows.
; No instruction visible in this listing reads rcx afterwards.
lea rcx, [rdi+rdx*4] ;
562
; rdx is reused: from here on it holds the source stride.
movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
564
; get the first horizontal line done ;
565
movd mm3, [rsi] ; loads 4 source bytes into the low half of mm3
566
; mm0 is assumed to be zero here; its initialisation is not visible.
punpcklbw mm3, mm0 ; widen the 4 bytes to four 16-bit words
575
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
577
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
582
add rsi, rdx ; next line
584
movd mm3, [rsi] ; loads 4 source bytes into the low half of mm3
585
punpcklbw mm3, mm0 ; widen the 4 bytes to four 16-bit words
599
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
601
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
606
; Vertical stage: multiply by the vector at rax + 16. The matching multiply
; by the vector at rax + 0 is not visible in this listing.
pmullw mm3, [rax+16] ;
610
paddw mm3, [GLOBAL(rd)] ; mm3 += round value
611
psraw mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic), i.e. /128
614
movd [rdi], mm3 ; store the results in the destination
617
; NOTE(review): two "advance to next line" sequences follow back to back;
; as listed rsi advances twice and r8 is loaded but never added to rdi.
; These look like alternative variants whose selecting directives (and the
; trailing "add rdi, r8") are not visible -- TODO confirm.
add rsi, rdx ; next line
618
add rdi, dword ptr arg(5) ;dst_pitch ;
620
movsxd r8, dword ptr arg(5) ;dst_pitch ;
621
add rsi, rdx ; next line
644
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
645
sym(vp8_six_tap_mmx):