1
;/*****************************************************************************
3
; * XVID MPEG-4 VIDEO CODEC
4
; * - mmx 8x8 block-based halfpel interpolation -
6
; * Copyright(C) 2002 Michael Militzer <isibaar@xvid.org>
7
; * 2002 Pascal Massimino <skal@planet-d.net>
8
; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
10
; * This program is free software ; you can redistribute it and/or modify
11
; * it under the terms of the GNU General Public License as published by
12
; * the Free Software Foundation ; either version 2 of the License, or
13
; * (at your option) any later version.
15
; * This program is distributed in the hope that it will be useful,
16
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
; * GNU General Public License for more details.
20
; * You should have received a copy of the GNU General Public License
21
; * along with this program ; if not, write to the Free Software
22
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24
; ****************************************************************************/
31
global _%1:function %1.endfunc-%1
32
%define %1 _%1:function %1.endfunc-%1
39
global %1:function %1.endfunc-%1
46
;=============================================================================
48
;=============================================================================
53
SECTION .rodata align=16
60
SECTION .text align=16
62
cglobal interpolate8x8_halfpel_h_x86_64
63
cglobal interpolate8x8_halfpel_v_x86_64
64
cglobal interpolate8x8_halfpel_hv_x86_64
66
cglobal interpolate8x8_halfpel_add_x86_64
67
cglobal interpolate8x8_halfpel_h_add_x86_64
68
cglobal interpolate8x8_halfpel_v_add_x86_64
69
cglobal interpolate8x8_halfpel_hv_add_x86_64
71
;===========================================================================
73
; void interpolate8x8_halfpel_h_x86_64(uint8_t * const dst,
74
; const uint8_t * const src,
75
; const uint32_t stride,
76
; const uint32_t rounding);
78
;===========================================================================
80
%macro COPY_H_SSE_RND0 0
84
pavgb mm1, [rax+rdx+1]
90
%macro COPY_H_SSE_RND1 0
111
interpolate8x8_halfpel_h_x86_64:
113
mov rax, rcx ; rounding
131
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
132
movq mm7, [mmx_one wrt rip]
143
;===========================================================================
145
; void interpolate8x8_halfpel_v_x86_64(uint8_t * const dst,
146
; const uint8_t * const src,
147
; const uint32_t stride,
148
; const uint32_t rounding);
150
;===========================================================================
152
%macro COPY_V_SSE_RND0 0
156
pavgb mm1, [rax+2*rdx]
162
%macro COPY_V_SSE_RND1 0
173
pand mm4, mm7 ; lsb's of (i^j)...
174
pand mm5, mm7 ; lsb's of (i^j)...
175
psubb mm0, mm4 ; ...are subtracted from result of pavgb
177
psubb mm1, mm5 ; ...are subtracted from result of pavgb
182
interpolate8x8_halfpel_v_x86_64:
183
mov rax, rcx ; rounding
189
; we process 2 lines at a time
202
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
203
movq mm7, [mmx_one wrt rip]
204
movq mm2, [rax] ; loop invariant
217
;===========================================================================
219
; void interpolate8x8_halfpel_hv_x86_64(uint8_t * const dst,
220
; const uint8_t * const src,
221
; const uint32_t stride,
222
; const uint32_t rounding);
225
;===========================================================================
227
; The trick is to correct the result of 'pavgb' with some combination of the
228
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
229
; The boolean relations are:
230
; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
231
; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
232
; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
233
; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
234
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
236
; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
238
%macro COPY_HV_SSE_RND0 0
245
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
247
pxor mm1, mm6 ; mm1=(j^k). preserved for next step
249
por mm3, mm1 ; ij |= jk
251
pxor mm6, mm0 ; mm6 = s^t
252
pand mm3, mm6 ; (ij|jk) &= st
253
pavgb mm2, mm0 ; mm2 = (s+t+1)/2
254
pand mm3, mm7 ; mask lsb
255
psubb mm2, mm3 ; apply.
262
pavgb mm2, mm3 ; preserved for next iteration
264
pxor mm3, mm6 ; preserved for next iteration
278
%macro COPY_HV_SSE_RND1 0
285
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
287
pxor mm1, mm6 ; mm1=(j^k). preserved for next step
302
pavgb mm2, mm3 ; preserved for next iteration
304
pxor mm3, mm6 ; preserved for next iteration
318
interpolate8x8_halfpel_hv_x86_64:
319
mov rax, rcx ; rounding
325
movq mm7, [mmx_one wrt rip]
327
; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
332
pxor mm3, mm6 ; mm2/mm3 ready
356
;===========================================================================
358
; The next functions combine both source halfpel interpolation step and the
359
; averaging (with rounding) step to avoid wasting memory bandwidth computing
360
; intermediate halfpel images and then averaging them.
362
;===========================================================================
368
mov r8, rdx ; saves rounding
375
test r8, 1; Rounding?
381
;===========================================================================
383
; void interpolate8x8_halfpel_add_x86_64(uint8_t * const dst,
384
; const uint8_t * const src,
385
; const uint32_t stride,
386
; const uint32_t rounding);
389
;===========================================================================
401
interpolate8x8_halfpel_add_x86_64: ; 23c
416
;===========================================================================
418
; void interpolate8x8_halfpel_h_add_x86_64(uint8_t * const dst,
419
; const uint8_t * const src,
420
; const uint32_t stride,
421
; const uint32_t rounding);
424
;===========================================================================
430
pavgb mm0, [rax+%1+1]
431
pavgb mm1, [rax+%2+1]
450
pand mm2, [mmx_one wrt rip]
451
pand mm3, [mmx_one wrt rip]
461
interpolate8x8_halfpel_h_add_x86_64: ; 32c
477
; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
478
; movq mm7, [mmx_one wrt rip]
493
;===========================================================================
495
; void interpolate8x8_halfpel_v_add_x86_64(uint8_t * const dst,
496
; const uint8_t * const src,
497
; const uint32_t stride,
498
; const uint32_t rounding);
501
;===========================================================================
503
%macro ADD_8_HF_RND0 0
507
pavgb mm1, [rax+2*rdx]
515
%macro ADD_8_HF_RND1 0
517
movq mm2, [rax+2*rdx]
525
pand mm4, mm7 ; lsb's of (i^j)...
526
pand mm5, mm7 ; lsb's of (i^j)...
527
psubb mm0, mm4 ; ...are subtracted from result of pavgb
530
psubb mm1, mm5 ; ...are subtracted from result of pavgb
536
interpolate8x8_halfpel_v_add_x86_64:
540
pxor mm7, mm7 ; this is a NOP
552
movq mm0, [rax] ; loop invariant
553
movq mm7, [mmx_one wrt rip]
568
; The trick is to correct the result of 'pavgb' with some combination of the
569
; lsb's of the 4 input values i,j,k,l, and their intermediate 'pavgb' (s and t).
570
; The boolean relations are:
571
; (i+j+k+l+3)/4 = (s+t+1)/2 - (ij&kl)&st
572
; (i+j+k+l+2)/4 = (s+t+1)/2 - (ij|kl)&st
573
; (i+j+k+l+1)/4 = (s+t+1)/2 - (ij&kl)|st
574
; (i+j+k+l+0)/4 = (s+t+1)/2 - (ij|kl)|st
575
; with s=(i+j+1)/2, t=(k+l+1)/2, ij = i^j, kl = k^l, st = s^t.
577
; Moreover, we process 2 lines at a time, for better overlapping (~15% faster).
579
;===========================================================================
581
; void interpolate8x8_halfpel_hv_add_x86_64(uint8_t * const dst,
582
; const uint8_t * const src,
583
; const uint32_t stride,
584
; const uint32_t rounding);
587
;===========================================================================
596
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
598
pxor mm1, mm6 ; mm1=(j^k). preserved for next step
600
por mm3, mm1 ; ij |= jk
602
pxor mm6, mm0 ; mm6 = s^t
603
pand mm3, mm6 ; (ij|jk) &= st
604
pavgb mm2, mm0 ; mm2 = (s+t+1)/2
605
pand mm3, mm7 ; mask lsb
606
psubb mm2, mm3 ; apply.
614
pavgb mm2, mm3 ; preserved for next iteration
616
pxor mm3, mm6 ; preserved for next iteration
638
pavgb mm0, mm1 ; mm0=(j+k+1)/2. preserved for next step
640
pxor mm1, mm6 ; mm1=(j^k). preserved for next step
656
pavgb mm2, mm3 ; preserved for next iteration
658
pxor mm3, mm6 ; preserved for next iteration
673
interpolate8x8_halfpel_hv_add_x86_64:
676
movq mm7, [mmx_one wrt rip]
678
; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
683
pxor mm3, mm6 ; mm2/mm3 ready