; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.

%include "vpx_ports/x86_abi_support.asm"
        ; %1 value not preserved
        movdqa      scratch1, %2            ; v2
        psubusb     scratch1, %1            ; v2 - v1
        psubusb     %1, %2                  ; v1 - v2
        por         %1, scratch1            ; abs(v2 - v1)
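        ; A compare-free absolute difference: because psubusb saturates at
        ; zero, one of the two subtractions yields the positive difference
        ; and the other yields zero, so the por produces abs(v1 - v2) in
        ; every byte.  Per byte: r = v1 > v2 ? v1 - v2 : v2 - v1.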
%macro LF_FILTER_HEV_MASK 8-9

        LF_ABS      %1, %2                  ; abs(p3 - p2)
        LF_ABS      %2, %3                  ; abs(p2 - p1)
        pmaxub      %1, %2                  ; accumulate mask

        movdqa      scratch2, %3            ; save p1
        LF_ABS      scratch2, %4            ; abs(p1 - p0)

        LF_ABS      %4, %5                  ; abs(p0 - q0)
        LF_ABS      %5, %6                  ; abs(q0 - q1)

        pmaxub      %5, scratch2            ; accumulate hev

        pmaxub      %1, %5                  ; accumulate mask
        LF_ABS      %3, %6                  ; abs(p1 - q1)
        LF_ABS      %6, %7                  ; abs(q1 - q2)
        pmaxub      %1, %6                  ; accumulate mask
        LF_ABS      %7, %8                  ; abs(q2 - q3)
        pmaxub      %1, %7                  ; accumulate mask

        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
        pand        %3, [GLOBAL(tfe)]
        psrlw       %3, 1                   ; abs(p1 - q1) / 2
        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
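        ; SSE2 has no per-byte shift: psrlw shifts 16-bit lanes, so bit 0 of
        ; each high byte would leak into the neighbouring low byte.  AND-ing
        ; with tfe (0xfe in every byte) clears each byte's bit 0 first, which
        ; turns psrlw 1 into an exact per-byte unsigned divide by two.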
        pcmpeqb     %1, zero                ; mask

        pcmpeqb     %5, zero                ; ~hev
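        ; Threshold tests use a subtract-and-compare idiom: after the
        ; (elided) psubusb of the limit/blimit/thresh constants, a byte is
        ; zero exactly where the measured difference was within bounds, so
        ; pcmpeqb against zero leaves 0xff in the lanes to be filtered
        ; (mask) and in the lanes whose edge variance is NOT high (~hev).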
        movdqa      scratch2, %6            ; save hev

        pxor        %1, [GLOBAL(t80)]       ; ps1
        pxor        %4, [GLOBAL(t80)]       ; qs1
        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
        pandn       scratch2, scratch1      ; vp8_filter &= hev
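        ; XOR with t80 (0x80 per byte) flips the sign bit, remapping unsigned
        ; pixels [0,255] onto signed [-128,127]; from here psubsb/paddsb
        ; reproduce the spec's clamping signed arithmetic, and the same XOR
        ; converts back at the end.  Note %6 holds ~hev from the mask macro,
        ; so pandn (~a & b) is what implements "vp8_filter &= hev".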
        pxor        %2, [GLOBAL(t80)]       ; ps0
        pxor        %3, [GLOBAL(t80)]       ; qs0
        psubsb      scratch1, %2            ; qs0 - ps0
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        pand        %5, scratch2            ; &= mask
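        ; scratch2 now holds the common VP8 filter value,
        ;     vp8_filter = signed_char_clamp(ps1 - qs1) + 3 * (qs0 - ps0)
        ; with the multiply by three done as three saturating byte adds; the
        ; pand with the mask zeroes it wherever filtering must not happen.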
        paddsb      %5, [GLOBAL(t4)]        ; Filter1
        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2
        pand        scratch1, [GLOBAL(te0)]
        pand        %5, [GLOBAL(t1f)]

        psubsb      %3, %5                  ; qs0 - Filter1
        pxor        %3, [GLOBAL(t80)]
        movdqa      scratch1, zero
        pcmpgtb     scratch1, scratch2      ; 0xff where Filter2 is negative
        pand        scratch1, [GLOBAL(te0)]
        pand        scratch2, [GLOBAL(t1f)]
        por         scratch2, scratch1

        paddsb      %2, scratch2            ; ps0 + Filter2
        pxor        %2, [GLOBAL(t80)]
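        ; Signed bytes have no arithmetic-shift instruction in SSE2, so
        ; "Filter >>= 3" is emulated: a psrlw by 3 (elided above) plus AND
        ; t1f (0x1f) gives the logical per-byte shift, pcmpgtb against zero
        ; flags the negative bytes, and AND te0 (0xe0) recreates the three
        ; sign bits an arithmetic shift would propagate; por merges them.
        ; Per byte, effectively: r = (f >> 3) | (f < 0 ? 0xe0 : 0).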
        ; outer tap adjustments
        paddsb      %5, [GLOBAL(t1)]        ; Filter1 + 1
        movdqa      scratch1, zero
        pand        scratch1, [GLOBAL(t80)]
        pand        %5, [GLOBAL(t7f)]

        pand        %5, %6                  ; vp8_filter &= ~hev

        psubsb      %4, %5                  ; qs1 - vp8_filter
        pxor        %4, [GLOBAL(t80)]

        paddsb      %1, %5                  ; ps1 + vp8_filter
        pxor        %1, [GLOBAL(t80)]
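        ; The outer taps take the gentler adjustment u = (Filter1 + 1) >> 1,
        ; and only where hev is clear (%6 is ~hev, so the pand restricts the
        ; outer taps to low-variance edges); t80/t7f play the same
        ; sign-propagation role for this >> 1 as te0/t1f did for the >> 3.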
;void vp8_loop_filter_bh_y_sse2
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):
%if LIBVPX_YASM_WIN64
    %define src     rcx ; src_ptr
    %define stride  rdx ; src_pixel_step
%else
    %define src     rdi ; src_ptr
    %define stride  rsi ; src_pixel_step

%define scratch1 xmm5
%define scratch2 xmm6
%define i2  [src + 2 * stride]
%define i3  [spp + 2 * stride]
%define i4  [src + 4 * stride]
%define i5  [spp + 4 * stride]
%define i6  [src + 2 * stride3]
%define i7  [spp + 2 * stride3]
%define i8  [src + 8 * stride]
%define i9  [spp + 8 * stride]
%define i10 [src + 2 * stride5]
%define i11 [spp + 2 * stride5]
%define i12 [src + 4 * stride3]
%define i13 [spp + 4 * stride3]
%define i14 [src + 2 * stride7]
%define i15 [spp + 2 * stride7]

        lea         spp, [src + stride]
        lea         stride3, [stride + 2 * stride]
        lea         stride5, [stride3 + 2 * stride]
        lea         stride7, [stride3 + 4 * stride]
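        ; x86 addressing scales an index register only by 1, 2, 4 or 8, so
        ; the sixteen rows are reached from two bases (src, and
        ; spp = src + stride) plus the precomputed odd multiples
        ; stride3/5/7: e.g. i6 = src + 2 * stride3 is row 6 and
        ; i13 = spp + 4 * stride3 is row 13.  Every i0..i15 therefore stays
        ; a single base + scale * index operand.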
        ; load the first set into registers
        movdqa      xmm9, i6                ; q2, will contain abs(p1-p0)
        LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

        LF_FILTER   xmm1, xmm2, xmm3, xmm8, xmm0, xmm4

        movdqa      xmm10, i10              ; q2, will contain abs(p1-p0)
        LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
        LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2

        movdqa      xmm9, i14               ; q2, will contain abs(p1-p0)
        LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
        LF_FILTER   xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
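        ; Three LF_FILTER_HEV_MASK / LF_FILTER pairs, one per inner
        ; horizontal edge of the 16-row luma block: with q2 loaded from i6,
        ; i10 and i14, the p0|q0 boundary lands on row pairs 3|4, 7|8 and
        ; 11|12 respectively.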
%if LIBVPX_YASM_WIN64

;void vp8_loop_filter_bv_y_sse2
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):
%if LIBVPX_YASM_WIN64
    %define src     rcx ; src_ptr
    %define stride  rdx ; src_pixel_step

%define scratch1 xmm5
%define scratch2 xmm6
%define s2  [src + 2 * stride]
%define s3  [spp + 2 * stride]
%define s4  [src + 4 * stride]
%define s5  [spp + 4 * stride]
%define s6  [src + 2 * stride3]
%define s7  [spp + 2 * stride3]
%define s8  [src + 8 * stride]
%define s9  [spp + 8 * stride]
%define s10 [src + 2 * stride5]
%define s11 [spp + 2 * stride5]
%define s12 [src + 4 * stride3]
%define s13 [spp + 4 * stride3]
%define s14 [src + 2 * stride7]
%define s15 [spp + 2 * stride7]
%define i1  [rsp + 16]
%define i2  [rsp + 32]
%define i3  [rsp + 48]
%define i4  [rsp + 64]
%define i5  [rsp + 80]
%define i6  [rsp + 96]
%define i7  [rsp + 112]
%define i8  [rsp + 128]
%define i9  [rsp + 144]
%define i10 [rsp + 160]
%define i11 [rsp + 176]
%define i12 [rsp + 192]
%define i13 [rsp + 208]
%define i14 [rsp + 224]
%define i15 [rsp + 240]
        ; reserve stack space
        %define temp_storage 0 ; size is 256 (16*16)
        %define stack_size 256
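        ; The vertical-edge filter cannot address p3..q3 directly (they lie
        ; along a row), so the 16x16 block is first transposed into this
        ; 256-byte stack buffer; the i0..i15 slots above, 16 bytes apart,
        ; name the transposed rows.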
        lea         spp, [src + stride]
        lea         stride3, [stride + 2 * stride]
        lea         stride5, [stride3 + 2 * stride]
        lea         stride7, [stride3 + 4 * stride]
        punpcklbw   xmm0, s9                ; 80 90
        punpckhbw   xmm1, s9                ; 88 98

        punpcklbw   xmm2, s11               ; a0 b0
        punpckhbw   xmm3, s11               ; a8 b8

        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
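        ; Comment notation is hex row/column: "80 90" means the byte from
        ; row 8, column 0 now sits beside the byte from row 9, column 0.
        ; Interleaving bytes, then words, then dwords, then qwords is the
        ; classic SSE2 16x16 transpose; each stage doubles the run of
        ; consecutive rows gathered for every column.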
        ; work on next 4 rows

        punpcklbw   xmm3, s13               ; c0 d0
        punpckhbw   xmm5, s13               ; c8 d8

        punpcklbw   xmm6, s15               ; e0 f0
        punpckhbw   xmm7, s15               ; e8 f8

        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc
        ; pull the third and fourth sets together

        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        punpcklbw   xmm0, s1                ; 00 10
        punpckhbw   xmm1, s1                ; 08 18

        punpcklbw   xmm2, s3                ; 20 30
        punpckhbw   xmm3, s3                ; 28 38

        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
        ; work on next 4 rows

        punpcklbw   xmm3, s5                ; 40 50
        punpckhbw   xmm5, s5                ; 48 58

        punpcklbw   xmm6, s7                ; 60 70
        punpckhbw   xmm7, s7                ; 68 78

        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
        ; pull the first two sets together

        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
        ; TRANSPOSED DATA AVAILABLE ON THE STACK

        LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
        LF_FILTER   xmm1, xmm2, xmm8, xmm9, xmm0, xmm4

        movdqa      xmm10, i10              ; q2, will contain abs(p1-p0)
        LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
        LF_FILTER   xmm0, xmm1, xmm3, xmm4, xmm8, xmm2

        movdqa      xmm9, i14               ; q2, will contain abs(p1-p0)
        LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
        LF_FILTER   xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
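        ; The same three edges as the horizontal routine, but applied to the
        ; transposed copy, so these are the vertical edges between columns
        ; 3|4, 7|8 and 11|12.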
        ; RESHUFFLE AND WRITE OUT

        punpcklbw   xmm0, i9                ; 80 90
        punpckhbw   xmm1, i9                ; 88 98

        punpcklbw   xmm2, i11               ; a0 b0
        punpckhbw   xmm3, i11               ; a8 b8

        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
        ; work on next 4 rows

        punpcklbw   xmm3, i13               ; c0 d0
        punpckhbw   xmm5, i13               ; c8 d8

        punpcklbw   xmm6, i15               ; e0 f0
        punpckhbw   xmm7, i15               ; e8 f8

        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc
        ; pull the third and fourth sets together

        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        punpcklbw   xmm0, i1                ; 00 10
        punpckhbw   xmm1, i1                ; 08 18

        punpcklbw   xmm2, i3                ; 20 30
        punpckhbw   xmm3, i3                ; 28 38

        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
        ; work on next 4 rows

        punpcklbw   xmm3, i5                ; 40 50
        punpckhbw   xmm5, i5                ; 48 58

        punpcklbw   xmm6, i7                ; 60 70
        punpckhbw   xmm7, i7                ; 68 78

        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
        ; pull the first two sets together

        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
        punpckhqdq  xmm10, i10
        punpckhqdq  xmm11, i11
        punpckhqdq  xmm12, i12
        punpckhqdq  xmm13, i13
        punpckhqdq  xmm14, i14
        punpckhqdq  xmm15, i15
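        ; Each punpckhqdq splices the high 64-bit half already in the
        ; register with the high half of the matching stack row, rebuilding
        ; complete 16-byte rows ahead of the (elided) stores back out to the
        ; frame.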
%if LIBVPX_YASM_WIN64