1
/* transfrm.c, forward / inverse transformation
2
In compiler (gcc) embedded assembly language... */
4
/* Modifications and enhancements (C) 2000-2003 Andrew Stevens */
6
/* These modifications are free software; you can redistribute it
7
* and/or modify it under the terms of the GNU General Public License
8
* as published by the Free Software Foundation; either version 2 of
9
* the License, or (at your option) any later version.
11
* This program is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
* General Public License for more details.
16
* You should have received a copy of the GNU General Public License
17
* along with this program; if not, write to the Free Software
18
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
26
#include "mjpeg_types.h"
27
#include "syntaxparams.h"
28
#include "attributes.h"
32
static __inline__ void
33
mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
35
movq_m2r( *accs, mm1 );
38
/* Generate sign extensions for mm1 words! */
40
punpcklwd_r2r( mm3, mm1 );
41
punpckhwd_r2r( mm3, mm2 );
42
paddd_r2r( mm1, mm2 );
46
movd_r2m( mm3, *res );
50
static __inline__ void
51
sum_sumsq_8bytes( uint8_t *cur_lum_mb,
62
/* Load pixels from top field into mm1.w,mm2.w
64
movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );
65
movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );
67
/* mm3 := mm1 mm4 := mm2
68
mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
72
punpcklbw_r2r( mm0, mm1 );
74
punpcklbw_r2r( mm0, mm2 );
75
psubw_r2r( mm2, mm1 );
77
/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
79
punpckhbw_r2r( mm0, mm3 );
80
punpckhbw_r2r( mm0, mm4 );
81
psubw_r2r( mm4, mm3 );
83
/* sumtop_accs->w[0..3] += mm1.w[0..3];
84
sumtop_accs->w[0..3] += mm3.w[0..3];
87
movq_m2r( *sumtop_accs, mm5 );
88
paddw_r2r( mm1, mm5 );
89
paddw_r2r( mm3, mm5 );
92
movq_r2m( mm5, *sumtop_accs );
95
*sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3];
96
*sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3];
98
pmaddwd_r2r( mm1, mm1 );
99
movq_m2r( *sumsqtop_accs, mm5 );
100
pmaddwd_r2r( mm3, mm3 );
101
paddd_r2r( mm1, mm5 );
102
paddd_r2r( mm3, mm5 );
103
movq_r2m( mm5, *sumsqtop_accs );
106
/* Load pixels from bot field into mm1.w,mm2.w
108
movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 );
109
movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 );
111
/* mm3 := mm1 mm4 := mm2
112
mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
115
movq_r2r( mm1, mm3 );
116
punpcklbw_r2r( mm0, mm1 );
117
movq_r2r( mm2, mm4 );
118
punpcklbw_r2r( mm0, mm2 );
119
psubw_r2r( mm2, mm1 );
121
/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
123
punpckhbw_r2r( mm0, mm3 );
124
punpckhbw_r2r( mm0, mm4 );
125
psubw_r2r( mm4, mm3 );
128
sumbot_accs->w[0..3] += mm1.w[0..3];
129
sumbot_accs->w[0..3] += mm3.w[0..3];
130
mm2 := mm1; mm4 := mm3;
132
movq_m2r( *sumbot_accs, mm5 );
133
paddw_r2r( mm1, mm5 );
134
movq_r2r( mm1, mm2 );
135
paddw_r2r( mm3, mm5 );
136
movq_r2r( mm3, mm4 );
137
movq_r2m( mm5, *sumbot_accs );
140
*sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3];
141
*sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3];
143
pmaddwd_r2r( mm1, mm1 );
144
movq_m2r( *sumsqbot_accs, mm5 );
145
pmaddwd_r2r( mm3, mm3 );
146
paddd_r2r( mm1, mm5 );
147
paddd_r2r( mm3, mm5 );
148
movq_r2m( mm5, *sumsqbot_accs );
151
/* Accumulate cross-product
152
*sum_xprod_acc += mm1.w[0..3] * mm6[0..3];
153
*sum_xprod_acc += mm3.w[0..3] * mm7[0..3];
156
movq_m2r( *sumxprod_accs, mm5 );
157
pmaddwd_r2r( mm6, mm2);
158
pmaddwd_r2r( mm7, mm4);
159
paddd_r2r( mm2, mm5 );
160
paddd_r2r( mm4, mm5 );
161
movq_r2m( mm5, *sumxprod_accs );
165
int field_dct_best_mmx( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb)
168
* calculate prediction error (cur-pred) for top (blk0)
169
* and bottom field (blk1)
173
int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
177
mmx_t sumtop_accs, sumbot_accs;
178
mmx_t sumsqtop_accs, sumsqbot_accs, sumxprod_accs;
179
int32_t sumtop_acc, sumbot_acc;
180
int32_t sumsqtop_acc, sumsqbot_acc, sumxprod_acc;
183
movq_r2m( mm0, *(&sumtop_accs) );
184
movq_r2m( mm0, *(&sumbot_accs) );
185
movq_r2m( mm0, *(&sumsqtop_accs) );
186
movq_r2m( mm0, *(&sumsqbot_accs) );
187
movq_r2m( mm0, *(&sumxprod_accs) );
189
sumtop = sumsqtop = sumbot = sumsqbot = sumbottop = 0;
190
sumtop_acc = sumbot_acc = sumsqtop_acc = sumsqbot_acc = sumxprod_acc = 0;
196
register int toppix =
197
cur_lum_mb[rowoffs+i] - pred_lum_mb[rowoffs+i];
198
register int botpix =
199
cur_lum_mb[rowoffs+width+i] - pred_lum_mb[rowoffs+width+i];
201
sumsqtop += toppix*toppix;
203
sumsqbot += botpix*botpix;
204
sumbottop += toppix*botpix;
207
sum_sumsq_8bytes( &cur_lum_mb[rowoffs], &pred_lum_mb[rowoffs],
208
&sumtop_accs, &sumbot_accs,
209
&sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs
211
sum_sumsq_8bytes( &cur_lum_mb[rowoffs+8], &pred_lum_mb[rowoffs+8],
212
&sumtop_accs, &sumbot_accs,
213
&sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs );
214
rowoffs += (opt->phy_width<<1);
217
mmx_sum_4_word_accs( &sumtop_accs, &sumtop );
218
mmx_sum_4_word_accs( &sumbot_accs, &sumbot );
220
sumsqtop = sumsqtop_accs.d[0] + sumsqtop_accs.d[1];
221
sumsqbot = sumsqbot_accs.d[0] + sumsqbot_accs.d[1];
222
sumbottop = sumxprod_accs.d[0] + sumxprod_accs.d[1];
224
/* Calculate Variances top and bottom. If they're of similar
225
sign, estimate correlation; if it's good use frame DCT, otherwise
229
topvar = sumsqtop-sumtop*sumtop/128;
230
botvar = sumsqbot-sumbot*sumbot/128;
231
if ( !((topvar <= 0) ^ (botvar <= 0)) )
233
d = ((double) topvar) * ((double)botvar);
234
r = (sumbottop-(sumtop*sumbot)/128);
236
return 0; /* frame DCT */
238
return 1; /* field DCT */
241
return 1; /* field DCT */