~ubuntu-branches/ubuntu/hoary/kdemultimedia/hoary

« back to all changes in this revision

Viewing changes to mpeglib/lib/util/render/dither/dither32mmx.cpp

  • Committer: Bazaar Package Importer
  • Author(s): Martin Schulze
  • Date: 2003-01-22 15:00:51 UTC
  • Revision ID: james.westby@ubuntu.com-20030122150051-uihwkdoxf15mi1tn
Tags: upstream-2.2.2
Import upstream version 2.2.2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
  MMX ditherer for 32 bit displays
 
3
  Copyright (C) 2000  Martin Vogt
 
4
 
 
5
  This program is free software; you can redistribute it and/or modify
 
6
  it under the terms of the GNU Library General Public License as published by
 
7
  the Free Software Foundation.
 
8
 
 
9
  For more information look at the file COPYRIGHT in this package
 
10
 
 
11
 */
 
12
 
 
13
 
 
14
 
 
15
#include "ditherMMX.h"
 
16
 
 
17
 
 
18
 
 
19
#ifndef INTEL
 
20
  void dither32_mmx(unsigned char* lum,
 
21
                  unsigned char* cr,
 
22
                  unsigned char* cb,
 
23
                  unsigned char* out,
 
24
                  int rows,
 
25
                  int cols,
 
26
                  int mod) {
 
27
  printf("urgs! dither32_mmx \n");
 
28
  printf("never should happen!\n");
 
29
  exit(0);
 
30
}
 
31
 
 
32
#else
 
33
 
 
34
 
 
35
/*
  64-bit constant operands referenced by symbol name from the inline
  assembler in dither32_mmx(). Every table must be exactly 8 bytes
  because the MMX instructions load a full quadword from it.

  The word masks are declared as two 32-bit "unsigned int" values;
  "unsigned long" would be 8 bytes per element on LP64 targets and
  would change the layout seen by the 64-bit loads.
*/
static unsigned int   MMX32_80w[]         = {0x00800080, 0x00800080}; /* 4 x 128 (chroma bias)  */
static unsigned int   MMX32_10w[]         = {0x00100010, 0x00100010}; /* 4 x 16                 */
static unsigned int   MMX32_00FFw[]       = {0x00ff00ff, 0x00ff00ff}; /* low byte of each word  */
static unsigned int   MMX32_FF00w[]       = {0xff00ff00, 0xff00ff00}; /* high byte of each word */
static unsigned short MMX32_Ycoeff[]      = {0x4a, 0x4a, 0x4a, 0x4a}; /* 4 x 74                 */
static unsigned short MMX32_Vredcoeff[]   = {0x59, 0x59, 0x59, 0x59}; /* 4 x 89                 */
static unsigned short MMX32_Ubluecoeff[]  = {0x72, 0x72, 0x72, 0x72}; /* 4 x 114                */
static unsigned short MMX32_Ugrncoeff[]   = {0xffea,0xffea,0xffea,0xffea}; /* 4 x -22 as 16 bit */
static unsigned short MMX32_Vgrncoeff[]   = {0xffd2,0xffd2,0xffd2,0xffd2}; /* 4 x -46 as 16 bit */
 
44
 
 
45
void dummy_dithermmx32() {
 
46
  cout << "MMX32_10w:"<<MMX32_10w<<endl;
 
47
  cout << "MMX32_80w:"<<MMX32_80w<<endl;
 
48
  cout << "MMX32_Ubluecoeff:"<<MMX32_Ubluecoeff<<endl;
 
49
  cout << "MMX32_Vredcoeff:"<<MMX32_Vredcoeff<<endl;
 
50
  cout << "MMX32_Ugrncoeff:"<<MMX32_Ugrncoeff<<endl;
 
51
  cout << "MMX32_Vgrncoeff:"<<MMX32_Vgrncoeff<<endl;
 
52
  cout << "MMX32_Ycoeff:"<<MMX32_Ycoeff<<endl;
 
53
  cout << "MMX32_00FFw:"<<MMX32_00FFw<<endl;
 
54
  cout << "MMX32_FF00w:"<<MMX32_FF00w<<endl;
 
55
}
 
56
 
 
57
 
 
58
/**
 
59
   This MMX assembler is my first assembler/MMX program ever.
 
60
   Thus it may be buggy.
 
61
   Send patches to:
 
62
   mvogt@rhrk.uni-kl.de
 
63
 
 
64
   After it worked fine I have "obfuscated" the code a bit to have
 
65
   more parallelism in the MMX units. This means I moved
 
66
   initialisation around and delayed other instructions.
 
67
   Performance measurement did not show that this brought any advantage
 
68
   but in theory it _should_ be faster this way.
 
69
 
 
70
   The overall performance gain over the C based dither was 30%-40%.
 
71
   The MMX routine calculates 256bit=8RGB values in each cycle
 
72
   (4 for row1 & 4 for row2)
 
73
 
 
74
   The red/green/blue.. coefficients are taken from the mpeg_play
 
75
   player. They look nice, but I don't know if you can have
 
76
   better values, to avoid integer rounding errors.
 
77
   
 
78
 
 
79
   IMPORTANT:
 
80
   ==========
 
81
 
 
82
   It is a requirement that the cr/cb/lum are 8 byte aligned and
 
83
   the out is 16 byte aligned or you will/may get segfaults
 
84
 
 
85
*/
 
86
 
 
87
void dither32_mmx(unsigned char* lum,
 
88
                  unsigned char* cr,
 
89
                  unsigned char* cb,
 
90
                  unsigned char* out,
 
91
                  int rows,
 
92
                  int cols,
 
93
                  int mod) {
 
94
 
 
95
 
 
96
    
 
97
    unsigned int *row1;
 
98
    unsigned int *row2;
 
99
    row1 = (unsigned int *)out;           // 32 bit target
 
100
 
 
101
    unsigned char* end = lum +cols*rows;    // Pointer to the end
 
102
    int x=cols;
 
103
    row2=row1+cols+mod;                   // start of second row 
 
104
    mod=4*cols+8*mod;                     // increment for row1 in byte
 
105
 
 
106
    // buffer for asm function
 
107
    int buf[6];
 
108
    buf[0]=(int)(lum+cols);   // lum2 pointer
 
109
    buf[1]=(int)end;
 
110
    buf[2]=x;
 
111
    buf[3]=mod;     
 
112
    buf[4]=0; //tmp0;
 
113
    buf[5]=cols;
 
114
 
 
115
 
 
116
    __asm__ __volatile__ (
 
117
                 ".align 32\n"
 
118
                 "1:\n"
 
119
                
 
120
                 // create Cr (result in mm1)
 
121
                 "movd (%0), %%mm1\n"      //         0  0  0  0  v3 v2 v1 v0
 
122
                 "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
 
123
                 "movd (%2), %%mm2\n"           //    0  0  0  0 l3 l2 l1 l0
 
124
                 "punpcklbw %%mm7,%%mm1\n" //         0  v3 0  v2 00 v1 00 v0
 
125
                 "punpckldq %%mm1,%%mm1\n" //         00 v1 00 v0 00 v1 00 v0
 
126
                 "psubw MMX32_80w,%%mm1\n"   // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 
 
127
 
 
128
                 // create Cr_g (result in mm0)
 
129
                 "movq %%mm1,%%mm0\n"           // r1 r1 r0 r0 r1 r1 r0 r0
 
130
                 "pmullw MMX32_Vgrncoeff,%%mm0\n" // red*-46dec=0.7136*64
 
131
                 "pmullw MMX32_Vredcoeff,%%mm1\n" // red*89dec=1.4013*64
 
132
                 "psraw  $6, %%mm0\n"           // red=red/64
 
133
                 "psraw  $6, %%mm1\n"           // red=red/64
 
134
 
 
135
                 
 
136
                 // create L1 L2 (result in mm2,mm4)
 
137
                 // L2=lum2
 
138
                 "movl %2,16%5\n"               // store register in tmp0
 
139
                 "movl %5,%2\n"                 // lum2->register
 
140
                 "movd (%2),%%mm3\n"            //    0  0  0  0 L3 L2 L1 L0
 
141
                 "movl 16%5,%2\n"               // tmp0->register
 
142
                 "punpckldq %%mm3,%%mm2\n"      //   L3 L2 L1 L0 l3 l2 l1 l0
 
143
                 "movq %%mm2,%%mm4\n"           //   L3 L2 L1 L0 l3 l2 l1 l0
 
144
                 "pand MMX32_FF00w, %%mm2\n"      //   L3 0  L1  0 l3  0 l1  0
 
145
                 "pand MMX32_00FFw, %%mm4\n"      //   0  L2  0 L0  0 l2  0 l0
 
146
                 "psrlw $8,%%mm2\n"             //   0  L3  0 L1  0 l3  0 l1
 
147
 
 
148
 
 
149
 
 
150
                 // create R (result in mm6)
 
151
                 "movq %%mm2,%%mm5\n"           //   0 L3  0 L1  0 l3  0 l1
 
152
                 "movq %%mm4,%%mm6\n"           //   0 L2  0 L0  0 l2  0 l0
 
153
                 "paddsw  %%mm1, %%mm5\n"       // lum1+red:x R3 x R1 x r3 x r1
 
154
                 "paddsw  %%mm1, %%mm6\n"       // lum1+red:x R2 x R0 x r2 x r0
 
155
                 "packuswb %%mm5,%%mm5\n"       //  R3 R1 r3 r1 R3 R1 r3 r1
 
156
                 "packuswb %%mm6,%%mm6\n"       //  R2 R0 r2 r0 R2 R0 r2 r0
 
157
                 "pxor %%mm7,%%mm7\n"      //         00 00 00 00 00 00 00 00
 
158
                 "punpcklbw %%mm5,%%mm6\n"      //  R3 R2 R1 R0 r3 r2 r1 r0
 
159
 
 
160
 
 
161
                 // create Cb (result in mm1)
 
162
                 "movd (%1), %%mm1\n"      //         0  0  0  0  u3 u2 u1 u0
 
163
                 "punpcklbw %%mm7,%%mm1\n" //         0  u3 0  u2 00 u1 00 u0
 
164
                 "punpckldq %%mm1,%%mm1\n" //         00 u1 00 u0 00 u1 00 u0
 
165
                 "psubw MMX32_80w,%%mm1\n"   // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 
 
166
                 // create Cb_g (result in mm5)
 
167
                 "movq %%mm1,%%mm5\n"            // u1 u1 u0 u0 u1 u1 u0 u0
 
168
                 "pmullw MMX32_Ugrncoeff,%%mm5\n"  // blue*-109dec=1.7129*64
 
169
                 "pmullw MMX32_Ubluecoeff,%%mm1\n" // blue*114dec=1.78125*64
 
170
                 "psraw  $6, %%mm5\n"            // blue=red/64
 
171
                 "psraw  $6, %%mm1\n"            // blue=blue/64
 
172
 
 
173
 
 
174
                 // create G (result in mm7)
 
175
                 "movq %%mm2,%%mm3\n"      //   0  L3  0 L1  0 l3  0 l1
 
176
                 "movq %%mm4,%%mm7\n"      //   0  L2  0 L0  0 l2  0 l1
 
177
                 "paddsw  %%mm5, %%mm3\n"  // lum1+Cb_g:x G3t x G1t x g3t x g1t
 
178
                 "paddsw  %%mm5, %%mm7\n"  // lum1+Cb_g:x G2t x G0t x g2t x g0t
 
179
                 "paddsw  %%mm0, %%mm3\n"  // lum1+Cr_g:x G3  x G1  x g3  x g1
 
180
                 "paddsw  %%mm0, %%mm7\n"  // lum1+blue:x G2  x G0  x g2  x g0
 
181
                 "packuswb %%mm3,%%mm3\n"  // G3 G1 g3 g1 G3 G1 g3 g1
 
182
                 "packuswb %%mm7,%%mm7\n"  // G2 G0 g2 g0 G2 G0 g2 g0
 
183
                 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0
 
184
                 
 
185
 
 
186
                 // create B (result in mm5)
 
187
                 "movq %%mm2,%%mm3\n"         //   0  L3  0 L1  0 l3  0 l1
 
188
                 "movq %%mm4,%%mm5\n"         //   0  L2  0 L0  0 l2  0 l1
 
189
                 "paddsw  %%mm1, %%mm3\n"     // lum1+blue:x B3 x B1 x b3 x b1
 
190
                 "paddsw  %%mm1, %%mm5\n"     // lum1+blue:x B2 x B0 x b2 x b0
 
191
                 "packuswb %%mm3,%%mm3\n"     // B3 B1 b3 b1 B3 B1 b3 b1
 
192
                 "packuswb %%mm5,%%mm5\n"     // B2 B0 b2 b0 B2 B0 b2 b0
 
193
                 "punpcklbw %%mm3,%%mm5\n"    // B3 B2 B1 B0 b3 b2 b1 b0
 
194
 
 
195
 
 
196
                 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
 
197
 
 
198
                 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
 
199
                 "pxor %%mm4,%%mm4\n"           //  0  0  0  0  0  0  0  0
 
200
                 "movq %%mm6,%%mm1\n"           // R3 R2 R1 R0 r3 r2 r1 r0
 
201
                 "movq %%mm5,%%mm3\n"           // B3 B2 B1 B0 b3 b2 b1 b0
 
202
                 // process lower lum
 
203
                 "punpcklbw %%mm4,%%mm1\n"      //  0 r3  0 r2  0 r1  0 r0
 
204
                 "punpcklbw %%mm4,%%mm3\n"      //  0 b3  0 b2  0 b1  0 b0
 
205
                 "movq %%mm1,%%mm2\n"           //  0 r3  0 r2  0 r1  0 r0
 
206
                 "movq %%mm3,%%mm0\n"           //  0 b3  0 b2  0 b1  0 b0
 
207
                 "punpcklwd %%mm1,%%mm3\n"      //  0 r1  0 b1  0 r0  0 b0
 
208
                 "punpckhwd %%mm2,%%mm0\n"      //  0 r3  0 b3  0 r2  0 b2
 
209
 
 
210
                 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
 
211
                 "movq %%mm7,%%mm1\n"           // G3 G2 G1 G0 g3 g2 g1 g0
 
212
                 "punpcklbw %%mm1,%%mm2\n"      // g3  0 g2  0 g1  0 g0  0
 
213
                 "punpcklwd %%mm4,%%mm2\n"      //  0  0 g1  0  0  0 g0  0 
 
214
                 "por  %%mm3, %%mm2\n"      //  0 r1 g1 b1  0 r0 g0 b0
 
215
                 "movq   %%mm2,(%3)\n"          // wrote out ! row1
 
216
 
 
217
                 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
 
218
                 "punpcklbw %%mm1,%%mm4\n"      // g3  0 g2  0 g1  0 g0  0
 
219
                 "punpckhwd %%mm2,%%mm4\n"      //  0  0 g3  0  0  0 g2  0 
 
220
                 "por  %%mm0, %%mm4\n"      //  0 r3 g3 b3  0 r2 g2 b2
 
221
                 "movq   %%mm4,8(%3)\n"         // wrote out ! row1
 
222
                 
 
223
                 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
 
224
                 // this can be done "destructive"
 
225
                 "pxor %%mm2,%%mm2\n"           //  0  0  0  0  0  0  0  0
 
226
                 "punpckhbw %%mm2,%%mm6\n"      //  0 R3  0 R2  0 R1  0 R0
 
227
                 "punpckhbw %%mm1,%%mm5\n"      // G3 B3 G2 B2 G1 B1 G0 B0
 
228
                 "movq %%mm5,%%mm1\n"           // G3 B3 G2 B2 G1 B1 G0 B0
 
229
                 "punpcklwd %%mm6,%%mm1\n"      //  0 R1 G1 B1  0 R0 G0 B0
 
230
                 "movq   %%mm1,(%4)\n"          // wrote out ! row2
 
231
                 "punpckhwd %%mm6,%%mm5\n"      //  0 R3 G3 B3  0 R2 G2 B2
 
232
                 "movq   %%mm5,8(%4)\n"         // wrote out ! row2
 
233
                 
 
234
                 "addl  $4,%2\n"            // lum+4
 
235
                 "addl  $4,%5\n"            // lum2+4
 
236
                 "leal  16(%3),%3\n"        // row1+16
 
237
                 "leal  16(%4),%4\n"        // row2+16
 
238
                 "addl  $2, %0\n"           // cr+2
 
239
                 "addl  $2, %1\n"           // cb+2
 
240
 
 
241
                 "subl  $4,8%5\n"           // x+4 x is buf[2]
 
242
                 "cmpl  $0,8%5\n"
 
243
 
 
244
                 "jne   1b\n"
 
245
                 "addl           20%5,   %2\n" // lum  += cols 
 
246
                 "movl %2,16%5\n"              // store register in tmp0
 
247
                 "movl 20%5,%2\n"              // cols->register
 
248
 
 
249
                 "addl           %2,     %5\n" // lum2 += cols 
 
250
                 "addl           12%5,   %3\n" // row1+= mod is buf[0]
 
251
                 "addl           12%5,   %4\n" // row2+= mod is buf[0]
 
252
 
 
253
                 "movl %2, 8%5\n"              // x=cols
 
254
                 "movl 16%5,%2\n"              // store tmp0 in register
 
255
 
 
256
                 "cmpl           4%5,    %2\n"  // buf[1] is end
 
257
                 "jl             1b\n"
 
258
                 "emms\n"
 
259
                 :
 
260
                 : "r" (cr), "r"(cb),"r"(lum),
 
261
                 "r"(row1),"r"(row2),"m"(buf[0])
 
262
                 );
 
263
 
 
264
 
 
265
 
 
266
}
 
267
 
 
268
 
 
269
#endif