1
;*****************************************************************************
2
;* predict-a.asm: h264 encoder library
3
;*****************************************************************************
4
;* Copyright (C) 2005 x264 project
6
;* Authors: Loren Merritt <lorenm@u.washington.edu>
8
;* This program is free software; you can redistribute it and/or modify
9
;* it under the terms of the GNU General Public License as published by
10
;* the Free Software Foundation; either version 2 of the License, or
11
;* (at your option) any later version.
13
;* This program is distributed in the hope that it will be useful,
14
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
;* GNU General Public License for more details.
18
;* You should have received a copy of the GNU General Public License
19
;* along with this program; if not, write to the Free Software
20
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
21
;*****************************************************************************
25
;=============================================================================
26
; Macros and other preprocessor constants
27
;=============================================================================
29
%include "i386inc.asm"
; Write eight rows of 8 bytes each, starting at edx and spaced FDEC_STRIDE
; bytes apart: rows 0-3 receive %1, rows 4-7 receive %2.
32
movq [edx + 0*FDEC_STRIDE], %1
33
movq [edx + 1*FDEC_STRIDE], %1
34
movq [edx + 2*FDEC_STRIDE], %1
35
movq [edx + 3*FDEC_STRIDE], %1
36
; lower half of the block uses the second operand
movq [edx + 4*FDEC_STRIDE], %2
37
movq [edx + 5*FDEC_STRIDE], %2
38
movq [edx + 6*FDEC_STRIDE], %2
39
movq [edx + 7*FDEC_STRIDE], %2
66
;=============================================================================
68
;=============================================================================
72
; dest, left, right, src, tmp
73
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
74
%macro PRED8x8_LOWPASS 5
79
; Mask %3 with the pb_1 constant, reached through the GOT_ebx addressing
; suffix.
; NOTE(review): pb_1 is presumably 0x01 replicated in every byte, which
; would make this a per-byte rounding-bit extraction -- confirm at the
; definition of pb_1.
pand %3, [pb_1 GOT_ebx]
85
;-----------------------------------------------------------------------------
86
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
87
;-----------------------------------------------------------------------------
88
cglobal predict_8x8_v_mmxext
95
;-----------------------------------------------------------------------------
96
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge )
97
;-----------------------------------------------------------------------------
98
cglobal predict_8x8_dc_mmxext
101
; Load the two stack arguments: +8 is the second argument (edge in the
; prototype above), +4 is the first (src).
; NOTE(review): picesp is presumably esp adjusted for the PIC register save
; (defined in i386inc.asm) -- confirm.
mov eax, [picesp + 8]
102
mov edx, [picesp + 4]
107
; Add the pw_8 constant to the words of mm0.
; NOTE(review): presumably the +8 rounding term applied before the sum is
; shifted down to an average -- confirm against the following shift.
paddw mm0, [pw_8 GOT_ebx]
116
;-----------------------------------------------------------------------------
117
; void predict_8x8_dc_top_mmxext ( uint8_t *src, uint8_t *edge )
; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge )
118
;-----------------------------------------------------------------------------
123
; Load the two stack arguments: +8 is the second argument (edge), +4 is the
; first (src).
mov eax, [picesp + 8]
124
mov edx, [picesp + 4]
127
; Add the pw_4 constant to the words of mm0.
; NOTE(review): presumably the rounding term for a sum of 8 samples (half
; the bias used by predict_8x8_dc_mmxext, which adds pw_8) -- confirm.
paddw mm0, [pw_4 GOT_ebx]
136
; Instantiate the macro once per exported symbol; the second macro argument
; differs between the two variants (16 for dc_top, 7 for dc_left).
; NOTE(review): that argument is presumably a byte offset into the edge
; buffer -- confirm against the macro body.
PRED8x8_DC predict_8x8_dc_top_mmxext, 16
137
PRED8x8_DC predict_8x8_dc_left_mmxext, 7
139
;-----------------------------------------------------------------------------
140
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
141
;-----------------------------------------------------------------------------
142
cglobal predict_8x8_ddl_mmxext
145
; eax = second stack argument (edge), edx = first stack argument (src).
mov eax, [picesp + 8]
146
mov edx, [picesp + 4]
151
; Two 3-tap low-pass passes (macro operands: dest, left, right, src, tmp):
;   mm0 = filter of the 8 bytes at eax+16, neighbours in mm1/mm2, scratch mm7
;   mm1 = filter of the 8 bytes at eax+24, neighbours in mm3/mm4, scratch mm6
; Each output byte is (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2.
PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 16], mm7
152
PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 24], mm6
156
; Store the current 8-byte row from mm1 at row Y of the destination.
; NOTE(review): Y is presumably a preprocessor row counter updated between
; these stores -- confirm.
movq [edx + Y*FDEC_STRIDE], mm1
164
movq [edx + Y*FDEC_STRIDE], mm1
169
movq [edx + Y*FDEC_STRIDE], mm1
174
;-----------------------------------------------------------------------------
175
; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
176
;-----------------------------------------------------------------------------
177
cglobal predict_8x8_ddr_mmxext
180
; eax = second stack argument (edge), edx = first stack argument (src).
mov eax, [picesp + 8]
181
mov edx, [picesp + 4]
186
; Two 3-tap low-pass passes (macro operands: dest, left, right, src, tmp):
;   mm0 = filter of the 8 bytes at eax+8,  neighbours in mm1/mm2, scratch mm7
;   mm1 = filter of the 8 bytes at eax+16, neighbours in mm3/mm4, scratch mm6
; Each output byte is (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2.
PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 8], mm7
187
PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 16], mm6
191
; Store the current 8-byte row from mm0 at row Y of the destination.
; NOTE(review): Y is presumably a preprocessor row counter updated between
; these stores -- confirm.
movq [edx + Y*FDEC_STRIDE], mm0
199
movq [edx + Y*FDEC_STRIDE], mm0
204
movq [edx + Y*FDEC_STRIDE], mm0
209
;-----------------------------------------------------------------------------
210
; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
211
;-----------------------------------------------------------------------------
213
; fills only some pixels:
224
cglobal predict_8x8_vr_core_mmxext
227
; eax = second stack argument (edge), edx = first stack argument (src).
mov eax, [picesp + 8]
228
mov edx, [picesp + 4]
234
; mm0 = 3-tap low-pass of the row held in mm4, with its left/right
; neighbours in mm1/mm2 and mm7 as scratch:
; (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2 per byte.
PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
238
; Rows are written in pairs: row Y comes from mm3, row Y+1 from mm0.
; NOTE(review): Y is presumably a preprocessor row counter advanced by two
; between pairs -- confirm.
movq [edx + Y *FDEC_STRIDE], mm3
239
movq [edx + (Y+1)*FDEC_STRIDE], mm0
244
movq [edx + Y *FDEC_STRIDE], mm3
245
movq [edx + (Y+1)*FDEC_STRIDE], mm0
250
;-----------------------------------------------------------------------------
251
; void predict_8x8c_v_mmx( uint8_t *src )
252
;-----------------------------------------------------------------------------
253
cglobal predict_8x8c_v_mmx
255
; mm0 = the 8 bytes located one FDEC_STRIDE before edx, i.e. the row
; immediately above the block that edx points at.
movq mm0, [edx - FDEC_STRIDE]
259
;-----------------------------------------------------------------------------
260
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
261
;-----------------------------------------------------------------------------
262
cglobal predict_8x8c_dc_core_mmxext
266
; edx = first stack argument (src).
mov edx, [picesp + 4]
268
; mm0 = the 8 bytes of the row immediately above the block.
movq mm0, [edx - FDEC_STRIDE]
276
; 64-bit memory operand: it spans the second and third stack arguments
; (s2 at +8, s3 at +12), so s2 is added into words 0-1 of mm0 and s3 into
; words 2-3.
paddw mm0, [picesp + 8]
277
; Broadcast the low 16 bits of the third argument (s3) to all four words
; of mm2.
pshufw mm2, [picesp + 12], 0
279
; Add the pw_2 constant to the words of mm1.
; NOTE(review): presumably the rounding term for the >> 2 applied to mm1
; below -- confirm.
paddw mm1, [pw_2 GOT_ebx]
282
; Replicate word 0 of mm0 across the register.
pshufw mm0, mm0, 0 ; dc0 (w)
284
; Scale the remaining sums down to averages: dc3 is divided by 8, dc2 and
; dc1 by 4.
psrlw mm3, 3 ; dc3 (w)
285
psrlw mm2, 2 ; dc2 (w)
286
psrlw mm1, 2 ; dc1 (w)
288
; Narrow words to unsigned-saturated bytes: the low 4 bytes come from the
; destination register, the high 4 bytes from the source register.
packuswb mm0, mm1 ; dc0,dc1 (b)
289
packuswb mm2, mm3 ; dc2,dc3 (b)
296
;-----------------------------------------------------------------------------
297
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
298
;-----------------------------------------------------------------------------
299
cglobal predict_8x8c_p_core_mmxext
303
; edx = first stack argument (src).
mov edx, [picesp + 4]
305
; Broadcast the low 16 bits of each integer argument to all four words:
;   mm0 = i00 (2nd argument), mm2 = b (3rd argument), mm4 = c (4th argument)
pshufw mm0, [picesp + 8], 0
306
pshufw mm2, [picesp +12], 0
307
pshufw mm4, [picesp +16], 0
309
; Per-word multiply of the broadcast b by the pw_3210 constant.
; NOTE(review): pw_3210 presumably holds the words 0,1,2,3 (low to high),
; giving {0*b, 1*b, 2*b, 3*b} -- consistent with the comment on the next
; instruction; confirm at its definition.
pmullw mm2, [pw_3210 GOT_ebx]
311
; Saturating signed adds build the per-column base values for the row.
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
312
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
334
;-----------------------------------------------------------------------------
335
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
336
;-----------------------------------------------------------------------------
337
cglobal predict_16x16_p_core_mmxext
341
; edx = first stack argument (src).
mov edx, [picesp + 4]
343
; Broadcast the low 16 bits of each integer argument to all four words:
;   mm0 = i00 (2nd argument), mm2 = b (3rd argument), mm4 = c (4th argument)
pshufw mm0, [picesp + 8], 0
344
pshufw mm2, [picesp +12], 0
345
pshufw mm4, [picesp +16], 0
348
; Per-word multiply of mm5 by the pw_3210 constant.
; NOTE(review): presumably mm5 holds a copy of the broadcast b and pw_3210
; the words 0,1,2,3, giving {0*b, 1*b, 2*b, 3*b} -- confirm.
pmullw mm5, [pw_3210 GOT_ebx]
352
; Saturating signed adds build the base values for the four groups of four
; columns that make up one 16-pixel row.
paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
353
paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
354
paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
355
paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
386
;-----------------------------------------------------------------------------
387
; void predict_16x16_v_mmx( uint8_t *src )
388
;-----------------------------------------------------------------------------
389
cglobal predict_16x16_v_mmx
392
; Register roles in this block: edx = current base row pointer,
; ecx = stride, eax = 3 * stride.
; The trailing number on each SAVE_0_1 line is the destination row index.
; Rows are visited out of order because every address has the form
; base + {1,2,4,8} * (stride or 3*stride); all rows reachable from one base
; are emitted before the base is advanced.
sub edx, ecx ; edx <-- line -1
396
lea eax, [ecx + 2*ecx] ; eax <-- 3* stride
398
; From base = line -1: offsets of 1,2,3,4,6,8,12 strides -> rows 0,1,2,3,5,7,11.
SAVE_0_1 (edx + ecx) ; 0
399
SAVE_0_1 (edx + 2 * ecx) ; 1
400
SAVE_0_1 (edx + eax) ; 2
401
SAVE_0_1 (edx + 4 * ecx) ; 3
402
SAVE_0_1 (edx + 2 * eax) ; 5
403
SAVE_0_1 (edx + 8 * ecx) ; 7
404
SAVE_0_1 (edx + 4 * eax) ; 11
405
; From base = line 0: offsets of 4,6,8,12 strides -> rows 4,6,8,12.
add edx, ecx ; edx <-- line 0
406
SAVE_0_1 (edx + 4 * ecx) ; 4
407
SAVE_0_1 (edx + 2 * eax) ; 6
408
SAVE_0_1 (edx + 8 * ecx) ; 8
409
SAVE_0_1 (edx + 4 * eax) ; 12
410
; From base = line 8: rows 9 and 10.
lea edx, [edx + 8 * ecx] ; edx <-- line 8
411
SAVE_0_1 (edx + ecx) ; 9
412
SAVE_0_1 (edx + 2 * ecx) ; 10
413
; From base = line 12: the last three rows.
lea edx, [edx + 4 * ecx] ; edx <-- line 12
414
SAVE_0_1 (edx + ecx) ; 13
415
SAVE_0_1 (edx + 2 * ecx) ; 14
416
SAVE_0_1 (edx + eax) ; 15
420
;-----------------------------------------------------------------------------
421
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
422
;-----------------------------------------------------------------------------
424
%macro PRED16x16_DC 3
427
; %1 = operand added to the pixel sum held in mm0 (see paddusw below).
; NOTE(review): %2 and %3 are presumably the shift count and the stack base
; register (the instantiations pass 5/esp and 4/picesp) -- confirm against
; the full macro body.
sub edx, ecx ; edx <-- line -1
432
; Sum of absolute differences between mm1 and the 8 bytes at edx+8 (the
; right half of row -1).
; NOTE(review): if mm1 was zeroed beforehand this is simply the sum of
; those 8 pixels -- confirm.
psadbw mm1, [edx + 8]
434
; NOTE(review) on the FIXME: with an mm destination, paddusw takes a 64-bit
; memory operand, which has no alignment requirement (unlike the 128-bit
; SSE forms), so an unaligned stack slot does not fault here. When %1 is a
; stack slot the load does span 8 bytes, i.e. beyond a 4-byte int argument.
paddusw mm0, %1 ; FIXME is stack alignment guaranteed?
438
lea eax, [ecx + 2*ecx] ; eax <-- 3* stride
439
packuswb mm0, mm0 ; dc in bytes
444
; Write four consecutive rows relative to edx (edx + 1..4 strides), then
; advance edx by four rows.
SAVE_0_0 (edx + ecx) ; 0
445
SAVE_0_0 (edx + 2 * ecx) ; 1
446
SAVE_0_0 (edx + eax) ; 2
447
SAVE_0_0 (edx + 4 * ecx) ; 3
449
lea edx, [edx + 4 * ecx]
455
cglobal predict_16x16_dc_core_mmxext
456
; dc_core: the value added to the sum is read from the stack at esp+8 (the
; second argument, i_dc_left in the prototype above).
; NOTE(review): esp is used directly rather than picesp, presumably because
; this variant reads no constant through the GOT and so saves no PIC
; register -- confirm.
PRED16x16_DC [esp+8], 5, esp
459
cglobal predict_16x16_dc_top_mmxext
462
; dc_top: the added value is the pw_8 constant, reached through the GOT, so
; stack accesses go through picesp. The second macro argument is 4 here,
; versus 5 for dc_core.
PRED16x16_DC [pw_8 GOT_ebx], 4, picesp