/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
 *
 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
 * for Intel's performance analysis of the MMX vs. non-MMX code.
 *
 * Last changed in libpng 1.2.15 January 5, 2007
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2007 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
 * Interface to libpng contributed by Gilles Vollant, 1999.
 * GNU C port by Greg Roelofs, 1999-2001.
19
* Lines 2350-4300 converted in place with intel2gas 1.3.1:
21
* intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
23
* and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
25
* NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
26
* is required to assemble the newer MMX instructions such as movq.
29
* ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
31
* (or a later version in the same directory). For Linux, check your
32
* distribution's web site(s) or try these links:
34
* http://rufus.w3.org/linux/RPM/binutils.html
35
* http://www.debian.org/Packages/stable/devel/binutils.html
36
* ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
39
* For other platforms, see the main GNU site:
41
* ftp://ftp.gnu.org/pub/gnu/binutils/
43
* Version 2.5.2l.15 is definitely too old...
47
* TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
48
* =====================================
51
* - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
54
* - additional optimizations (possible or definite):
55
* x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
56
* - write MMX code for 48-bit case (pixel_bytes == 6)
57
* - figure out what's up with 24-bit case (pixel_bytes == 3):
58
* why subtract 8 from width_mmx in the pass 4/5 case?
59
* (only width_mmx case) (near line 1606)
60
* x [DONE] replace pixel_bytes within each block with the true
61
* constant value (or are compilers smart enough to do that?)
62
* - rewrite all MMX interlacing code so it's aligned with
63
* the *beginning* of the row buffer, not the end. This
64
* would not only allow one to eliminate half of the memory
65
* writes for odd passes (that is, pass == odd), it may also
66
* eliminate some unaligned-data-access exceptions (assuming
67
* there's a penalty for not aligning 64-bit accesses on
68
* 64-bit boundaries). The only catch is that the "leftover"
69
* pixel(s) at the end of the row would have to be saved,
70
* but there are enough unused MMX registers in every case,
71
* so this is not a problem. A further benefit is that the
72
* post-MMX cleanup code (C code) in at least some of the
73
* cases could be done within the assembler block.
74
* x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
75
* inconsistent, and don't match the MMX Programmer's Reference
76
* Manual conventions anyway. They should be changed to
77
* "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
78
* was lowest in memory (e.g., corresponding to a left pixel)
79
* and b7 is the byte that was highest (e.g., a right pixel).
82
* - Brennan's Guide notwithstanding, gcc under Linux does *not*
83
* want globals prefixed by underscores when referencing them--
84
* i.e., if the variable is const4, then refer to it as const4,
85
* not _const4. This seems to be a djgpp-specific requirement.
86
* Also, such variables apparently *must* be declared outside
87
* of functions; neither static nor automatic variables work if
88
* defined within the scope of a single function, but both
89
* static and truly global (multi-module) variables work fine.
92
* - fixed png_combine_row() non-MMX replication bug (odd passes only?)
93
* - switched from string-concatenation-with-macros to cleaner method of
94
* renaming global variables for djgpp--i.e., always use prefixes in
95
* inlined assembler code (== strings) and conditionally rename the
96
* variables, not the other way around. Hence _const4, _mask8_0, etc.
99
* - fixed mmxsupport()/png_do_read_interlace() first-row bug
100
* This one was severely weird: even though mmxsupport() doesn't touch
101
* ebx (where "row" pointer was stored), it nevertheless managed to zero
102
* the register (even in static/non-fPIC code--see below), which in turn
103
* caused png_do_read_interlace() to return prematurely on the first row of
104
* interlaced images (i.e., without expanding the interlaced pixels).
105
* Inspection of the generated assembly code didn't turn up any clues,
106
* although it did point at a minor optimization (i.e., get rid of
107
* mmx_supported_local variable and just use eax). Possibly the CPUID
108
* instruction is more destructive than it looks? (Not yet checked.)
109
* - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
110
* listings... Apparently register spillage has to do with ebx, since
111
* it's used to index the global offset table. Commenting it out of the
112
* input-reg lists in png_combine_row() eliminated compiler barfage, so
113
* ifdef'd with __PIC__ macro: if defined, use a global for unmask
116
* - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
117
* "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
120
* - made "diff" variable (now "_dif") global to simplify conversion of
121
* filtering routines (running out of regs, sigh). "diff" is still used
122
* in interlacing routines, however.
123
* - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
124
* macro determines which is used); original not yet tested.
127
* - when compiling with gcc, be sure to use -fomit-frame-pointer
130
* - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
131
* pass == 4 or 5, that caused visible corruption of interlaced images
134
* - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
135
* many of the form "forbidden register 0 (ax) was spilled for class AREG."
136
* This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
137
* Chuck Wilson supplied a patch involving dummy output registers. See
138
* http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
139
* for the original (anonymous) SourceForge bug report.
142
* - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
143
* pnggccrd.c: In function `png_combine_row':
144
* pnggccrd.c:525: more than 10 operands in `asm'
145
* pnggccrd.c:669: more than 10 operands in `asm'
146
* pnggccrd.c:828: more than 10 operands in `asm'
147
* pnggccrd.c:994: more than 10 operands in `asm'
148
* pnggccrd.c:1177: more than 10 operands in `asm'
149
* They are all the same problem and can be worked around by using the
150
* global _unmask variable unconditionally, not just in the -fPIC case.
151
* Reportedly earlier versions of gcc also have the problem with more than
152
* 10 operands; they just don't report it. Much strangeness ensues, etc.
155
* - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
156
* MMX routine); began converting png_read_filter_row_mmx_sub()
157
* - to finish remaining sections:
158
* - clean up indentation and comments
159
* - preload local variables
160
* - add output and input regs (order of former determines numerical
162
* - avoid all usage of ebx (including bx, bh, bl) register [20000823]
163
* - remove "$" from addressing of Shift and Mask variables [20000823]
166
* - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
169
* - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
170
* shared-library (-fPIC) version! Code works just fine as part of static
171
* library. Damn damn damn damn damn, should have tested that sooner.
172
* ebx is getting clobbered again (explicitly this time); need to save it
173
* on stack or rewrite asm code to avoid using it altogether. Blargh!
176
* - first section was trickiest; all remaining sections have ebx -> edx now.
177
* (-fPIC works again.) Also added missing underscores to various Shift*
178
* and *Mask* globals and got rid of leading "$" signs.
181
* - added visual separators to help navigate microscopic printed copies
182
* (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
183
* on png_read_filter_row_mmx_avg()
186
* - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
187
* What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
188
* cleaned up/shortened in either routine, but functionality is complete
189
* and seems to be working fine.
192
* - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
193
* as an input reg (with dummy output variables, etc.), then it *cannot*
194
* also appear in the clobber list or gcc 2.95.2 will barf. The solution
195
* is simple enough...
198
* - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
199
* correctly (but 48-bit RGB just fine)
202
* - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
203
* - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
204
* - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
205
* - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
208
* - added new png_init_mmx_flags() function (here only because it needs to
209
* call mmxsupport(), which should probably become global png_mmxsupport());
210
* modified other MMX routines to run conditionally (png_ptr->asm_flags)
213
* - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
214
* and made it public; moved png_init_mmx_flags() to png.c as internal func
217
* - removed dependency on png_read_filter_row_c() (C code already duplicated
218
* within MMX version of png_read_filter_row()) so no longer necessary to
219
* compile it into pngrutil.o
222
* - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
225
* - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
228
* - more tinkering with clobber list at lines 4529 and 5033, to get
229
* it to compile on gcc-3.4.
232
* - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
233
* - write MMX code for 48-bit case (pixel_bytes == 6)
234
* - figure out what's up with 24-bit case (pixel_bytes == 3):
235
* why subtract 8 from width_mmx in the pass 4/5 case?
236
* (only width_mmx case) (near line 1606)
237
* - rewrite all MMX interlacing code so it's aligned with beginning
238
* of the row buffer, not the end (see 19991007 for details)
239
* x pick one version of mmxsupport() and get rid of the other
240
* - add error messages to any remaining bogus default cases
241
* - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
242
* x add support for runtime enable/disable/query of various MMX routines
248
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
250
/* Runtime MMX-capability probe.  Per the 20020219 changelog note above, it
 * also auto-sets the _mmx_supported flag as a side effect; returns the
 * detection result (exact return-value convention not visible here --
 * the definition is outside this chunk). */
int PNGAPI png_mmx_support(void);
252
#ifdef PNG_USE_LOCAL_ARRAYS
253
/* Adam7 interlace tables (local copies; masters live in png.c -- see the
 * "png.c:" comments in png_combine_row below).  png_pass_start[p] is the
 * starting pixel offset of pass p within each 8-pixel repeat; it is scaled
 * by bytes-per-pixel to compute initial_val in the C fallback loops. */
const static int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
254
/* png_pass_inc[p]: pixel stride between pixels written by interlace pass p
 * (scaled by bytes-per-pixel to form "stride" in the C fallback loops). */
const static int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
255
/* png_pass_width[7]: pixels replicated per copy in interlace pass p (scaled
 * by bytes-per-pixel to form "rep_bytes", the png_memcpy length, in the C
 * fallback loops of png_combine_row). */
const static int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
258
#if defined(PNG_MMX_CODE_SUPPORTED)
259
/* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
260
* so define them without: */
261
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
263
# define _mmx_supported mmx_supported
264
# define _const4 const4
265
# define _const6 const6
266
# define _mask8_0 mask8_0
267
# define _mask16_1 mask16_1
268
# define _mask16_0 mask16_0
269
# define _mask24_2 mask24_2
270
# define _mask24_1 mask24_1
271
# define _mask24_0 mask24_0
272
# define _mask32_3 mask32_3
273
# define _mask32_2 mask32_2
274
# define _mask32_1 mask32_1
275
# define _mask32_0 mask32_0
276
# define _mask48_5 mask48_5
277
# define _mask48_4 mask48_4
278
# define _mask48_3 mask48_3
279
# define _mask48_2 mask48_2
280
# define _mask48_1 mask48_1
281
# define _mask48_0 mask48_0
282
# define _LBCarryMask LBCarryMask
283
# define _HBClearMask HBClearMask
284
# define _ActiveMask ActiveMask
285
# define _ActiveMask2 ActiveMask2
286
# define _ActiveMaskEnd ActiveMaskEnd
287
# define _ShiftBpp ShiftBpp
288
# define _ShiftRem ShiftRem
289
#ifdef PNG_THREAD_UNSAFE_OK
290
# define _unmask unmask
291
# define _FullLength FullLength
292
# define _MMXLength MMXLength
294
# define _patemp patemp
295
# define _pbtemp pbtemp
296
# define _pctemp pctemp
301
/* These constants are used in the inlined MMX assembly code.
302
Ignore gcc's "At top level: defined but not used" warnings. */
304
/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
305
* since that case uses the %ebx register for indexing the Global Offset Table
306
* and there were no other registers available. But gcc 2.95 and later emit
307
* "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
308
* in the non-PIC case, so we'll just use the global unconditionally now.
310
#ifdef PNG_THREAD_UNSAFE_OK
314
/* 64-bit byte-select masks for the MMX paths of png_combine_row(): each
 * _maskNN_k constant is loaded with movq, ANDed against the replicated
 * 8-bit pixel mask in %mm7, then pcmpeqb-inverted to build the per-byte
 * keep/skip pattern for one 8-byte group of an 8-pixel run (NN = pixel
 * depth, k = which 8-byte group).  This one covers 8-bit pixels. */
const static unsigned long long _mask8_0 = 0x0102040810204080LL;
316
const static unsigned long long _mask16_1 = 0x0101020204040808LL;
317
const static unsigned long long _mask16_0 = 0x1010202040408080LL;
319
const static unsigned long long _mask24_2 = 0x0101010202020404LL;
320
const static unsigned long long _mask24_1 = 0x0408080810101020LL;
321
const static unsigned long long _mask24_0 = 0x2020404040808080LL;
323
const static unsigned long long _mask32_3 = 0x0101010102020202LL;
324
const static unsigned long long _mask32_2 = 0x0404040408080808LL;
325
const static unsigned long long _mask32_1 = 0x1010101020202020LL;
326
const static unsigned long long _mask32_0 = 0x4040404080808080LL;
328
const static unsigned long long _mask48_5 = 0x0101010101010202LL;
329
const static unsigned long long _mask48_4 = 0x0202020204040404LL;
330
const static unsigned long long _mask48_3 = 0x0404080808080808LL;
331
const static unsigned long long _mask48_2 = 0x1010101010102020LL;
332
const static unsigned long long _mask48_1 = 0x2020202040404040LL;
333
const static unsigned long long _mask48_0 = 0x4040808080808080LL;
335
const static unsigned long long _const4 = 0x0000000000FFFFFFLL;
336
//const static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
337
const static unsigned long long _const6 = 0x00000000000000FFLL;
339
// These are used in the row-filter routines and should/would be local
340
// variables if not for gcc addressing limitations.
341
// WARNING: Their presence probably defeats the thread safety of libpng.
343
#ifdef PNG_THREAD_UNSAFE_OK
344
static png_uint_32 _FullLength;
345
static png_uint_32 _MMXLength;
347
static int _patemp; // temp variables for Paeth routine
353
png_squelch_warnings(void)
355
#ifdef PNG_THREAD_UNSAFE_OK
360
_MMXLength = _MMXLength;
365
_mask16_1 = _mask16_1;
366
_mask16_0 = _mask16_0;
367
_mask24_2 = _mask24_2;
368
_mask24_1 = _mask24_1;
369
_mask24_0 = _mask24_0;
370
_mask32_3 = _mask32_3;
371
_mask32_2 = _mask32_2;
372
_mask32_1 = _mask32_1;
373
_mask32_0 = _mask32_0;
374
_mask48_5 = _mask48_5;
375
_mask48_4 = _mask48_4;
376
_mask48_3 = _mask48_3;
377
_mask48_2 = _mask48_2;
378
_mask48_1 = _mask48_1;
379
_mask48_0 = _mask48_0;
381
#endif /* PNG_MMX_CODE_SUPPORTED */
384
/* Tri-state MMX flag: 2 means "not yet determined"; png_mmx_support()
 * auto-sets it (see 20020219 changelog note).  png_combine_row() warns
 * ("asm_flags may not have been initialized") if it is still 2 there. */
static int _mmx_supported = 2;
386
/*===========================================================================*/
388
/* P N G _ C O M B I N E _ R O W */
390
/*===========================================================================*/
392
#if defined(PNG_HAVE_MMX_COMBINE_ROW)
395
#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
397
#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
400
/* Combines the row recently read in with the previous row.
401
This routine takes care of alpha and transparency if requested.
402
This routine also handles the two methods of progressive display
403
of interlaced images, depending on the mask value.
404
The mask value describes which pixels are to be combined with
405
the row. The pattern always repeats every 8 pixels, so just 8
406
bits are needed. A one indicates the pixel is to be combined; a
407
zero indicates the pixel is to be skipped. This is in addition
408
to any alpha or transparency value associated with the pixel.
409
If you want all pixels to be combined, pass 0xff (255) in mask. */
411
/* Use this routine for the x86 platform - it uses a faster MMX routine
412
if the machine supports MMX. */
415
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
417
png_debug(1, "in png_combine_row (pnggccrd.c)\n");
419
#if defined(PNG_MMX_CODE_SUPPORTED)
420
if (_mmx_supported == 2) {
421
#if !defined(PNG_1_0_X)
422
/* this should have happened in png_init_mmx_flags() already */
423
png_warning(png_ptr, "asm_flags may not have been initialized");
431
png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
432
png_memcpy(row, png_ptr->row_buf + 1,
433
(png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
435
else /* (png_combine_row() is never called with mask == 0) */
437
switch (png_ptr->row_info.pixel_depth)
439
case 1: /* png_ptr->row_info.pixel_depth */
443
int s_inc, s_start, s_end;
448
sp = png_ptr->row_buf + 1;
451
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
452
if (png_ptr->transformations & PNG_PACKSWAP)
468
for (i = 0; i < png_ptr->width; i++)
474
value = (*sp >> shift) & 0x1;
475
*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
476
*dp |= (png_byte)(value << shift);
496
case 2: /* png_ptr->row_info.pixel_depth */
500
int s_start, s_end, s_inc;
506
sp = png_ptr->row_buf + 1;
509
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
510
if (png_ptr->transformations & PNG_PACKSWAP)
526
for (i = 0; i < png_ptr->width; i++)
530
value = (*sp >> shift) & 0x3;
531
*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
532
*dp |= (png_byte)(value << shift);
551
case 4: /* png_ptr->row_info.pixel_depth */
555
int s_start, s_end, s_inc;
561
sp = png_ptr->row_buf + 1;
564
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
565
if (png_ptr->transformations & PNG_PACKSWAP)
580
for (i = 0; i < png_ptr->width; i++)
584
value = (*sp >> shift) & 0xf;
585
*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
586
*dp |= (png_byte)(value << shift);
605
case 8: /* png_ptr->row_info.pixel_depth */
610
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
611
#if !defined(PNG_1_0_X)
612
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
613
/* && _mmx_supported */ )
620
int dummy_value_a; // fix 'forbidden register spilled' error
625
_unmask = ~mask; // global variable for -fPIC version
626
srcptr = png_ptr->row_buf + 1;
628
len = png_ptr->width &~7; // reduce to multiple of 8
629
diff = (int) (png_ptr->width & 7); // amount lost
631
__asm__ __volatile__ (
632
"movd _unmask, %%mm7 \n\t" // load bit pattern
633
"psubb %%mm6, %%mm6 \n\t" // zero mm6
634
"punpcklbw %%mm7, %%mm7 \n\t"
635
"punpcklwd %%mm7, %%mm7 \n\t"
636
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
638
"movq _mask8_0, %%mm0 \n\t"
639
"pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
640
"pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
642
// preload "movl len, %%ecx \n\t" // load length of line
643
// preload "movl srcptr, %%esi \n\t" // load source
644
// preload "movl dstptr, %%edi \n\t" // load dest
646
"cmpl $0, %%ecx \n\t" // len == 0 ?
647
"je mainloop8end \n\t"
650
"movq (%%esi), %%mm4 \n\t" // *srcptr
651
"pand %%mm0, %%mm4 \n\t"
652
"movq %%mm0, %%mm6 \n\t"
653
"pandn (%%edi), %%mm6 \n\t" // *dstptr
654
"por %%mm6, %%mm4 \n\t"
655
"movq %%mm4, (%%edi) \n\t"
656
"addl $8, %%esi \n\t" // inc by 8 bytes processed
657
"addl $8, %%edi \n\t"
658
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
662
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
663
"movl %%eax, %%ecx \n\t"
664
"cmpl $0, %%ecx \n\t"
666
// preload "movl mask, %%edx \n\t"
667
"sall $24, %%edx \n\t" // make low byte, high byte
670
"sall %%edx \n\t" // move high bit to CF
671
"jnc skip8 \n\t" // if CF = 0
672
"movb (%%esi), %%al \n\t"
673
"movb %%al, (%%edi) \n\t"
679
"jnz secondloop8 \n\t"
684
: "=a" (dummy_value_a), // output regs (dummy)
685
"=d" (dummy_value_d),
686
"=c" (dummy_value_c),
687
"=S" (dummy_value_S),
690
: "3" (srcptr), // esi // input regs
693
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
697
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
698
: "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
702
else /* mmx _not supported - Use modified C routine */
703
#endif /* PNG_MMX_CODE_SUPPORTED */
705
register png_uint_32 i;
706
png_uint_32 initial_val = png_pass_start[png_ptr->pass];
707
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
708
register int stride = png_pass_inc[png_ptr->pass];
709
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
710
register int rep_bytes = png_pass_width[png_ptr->pass];
711
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
712
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
713
int diff = (int) (png_ptr->width & 7); /* amount lost */
714
register png_uint_32 final_val = len; /* GRR bugfix */
716
srcptr = png_ptr->row_buf + 1 + initial_val;
717
dstptr = row + initial_val;
719
for (i = initial_val; i < final_val; i += stride)
721
png_memcpy(dstptr, srcptr, rep_bytes);
725
if (diff) /* number of leftover pixels: 3 for pngtest */
727
final_val+=diff /* *BPP1 */ ;
728
for (; i < final_val; i += stride)
730
if (rep_bytes > (int)(final_val-i))
731
rep_bytes = (int)(final_val-i);
732
png_memcpy(dstptr, srcptr, rep_bytes);
738
} /* end of else (_mmx_supported) */
743
case 16: /* png_ptr->row_info.pixel_depth */
748
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
749
#if !defined(PNG_1_0_X)
750
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
751
/* && _mmx_supported */ )
758
int dummy_value_a; // fix 'forbidden register spilled' error
763
_unmask = ~mask; // global variable for -fPIC version
764
srcptr = png_ptr->row_buf + 1;
766
len = png_ptr->width &~7; // reduce to multiple of 8
767
diff = (int) (png_ptr->width & 7); // amount lost //
769
__asm__ __volatile__ (
770
"movd _unmask, %%mm7 \n\t" // load bit pattern
771
"psubb %%mm6, %%mm6 \n\t" // zero mm6
772
"punpcklbw %%mm7, %%mm7 \n\t"
773
"punpcklwd %%mm7, %%mm7 \n\t"
774
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
776
"movq _mask16_0, %%mm0 \n\t"
777
"movq _mask16_1, %%mm1 \n\t"
779
"pand %%mm7, %%mm0 \n\t"
780
"pand %%mm7, %%mm1 \n\t"
782
"pcmpeqb %%mm6, %%mm0 \n\t"
783
"pcmpeqb %%mm6, %%mm1 \n\t"
785
// preload "movl len, %%ecx \n\t" // load length of line
786
// preload "movl srcptr, %%esi \n\t" // load source
787
// preload "movl dstptr, %%edi \n\t" // load dest
789
"cmpl $0, %%ecx \n\t"
790
"jz mainloop16end \n\t"
793
"movq (%%esi), %%mm4 \n\t"
794
"pand %%mm0, %%mm4 \n\t"
795
"movq %%mm0, %%mm6 \n\t"
796
"movq (%%edi), %%mm7 \n\t"
797
"pandn %%mm7, %%mm6 \n\t"
798
"por %%mm6, %%mm4 \n\t"
799
"movq %%mm4, (%%edi) \n\t"
801
"movq 8(%%esi), %%mm5 \n\t"
802
"pand %%mm1, %%mm5 \n\t"
803
"movq %%mm1, %%mm7 \n\t"
804
"movq 8(%%edi), %%mm6 \n\t"
805
"pandn %%mm6, %%mm7 \n\t"
806
"por %%mm7, %%mm5 \n\t"
807
"movq %%mm5, 8(%%edi) \n\t"
809
"addl $16, %%esi \n\t" // inc by 16 bytes processed
810
"addl $16, %%edi \n\t"
811
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
814
"mainloop16end: \n\t"
815
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
816
"movl %%eax, %%ecx \n\t"
817
"cmpl $0, %%ecx \n\t"
819
// preload "movl mask, %%edx \n\t"
820
"sall $24, %%edx \n\t" // make low byte, high byte
823
"sall %%edx \n\t" // move high bit to CF
824
"jnc skip16 \n\t" // if CF = 0
825
"movw (%%esi), %%ax \n\t"
826
"movw %%ax, (%%edi) \n\t"
829
"addl $2, %%esi \n\t"
830
"addl $2, %%edi \n\t"
832
"jnz secondloop16 \n\t"
837
: "=a" (dummy_value_a), // output regs (dummy)
838
"=c" (dummy_value_c),
839
"=d" (dummy_value_d),
840
"=S" (dummy_value_S),
843
: "0" (diff), // eax // input regs
844
// was (unmask) " " RESERVED // ebx // Global Offset Table idx
850
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
851
: "%mm0", "%mm1", "%mm4" // clobber list
852
, "%mm5", "%mm6", "%mm7"
856
else /* mmx _not supported - Use modified C routine */
857
#endif /* PNG_MMX_CODE_SUPPORTED */
859
register png_uint_32 i;
860
png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
861
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
862
register int stride = BPP2 * png_pass_inc[png_ptr->pass];
863
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
864
register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
865
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
866
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
867
int diff = (int) (png_ptr->width & 7); /* amount lost */
868
register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
870
srcptr = png_ptr->row_buf + 1 + initial_val;
871
dstptr = row + initial_val;
873
for (i = initial_val; i < final_val; i += stride)
875
png_memcpy(dstptr, srcptr, rep_bytes);
879
if (diff) /* number of leftover pixels: 3 for pngtest */
881
final_val+=diff*BPP2;
882
for (; i < final_val; i += stride)
884
if (rep_bytes > (int)(final_val-i))
885
rep_bytes = (int)(final_val-i);
886
png_memcpy(dstptr, srcptr, rep_bytes);
891
} /* end of else (_mmx_supported) */
896
case 24: /* png_ptr->row_info.pixel_depth */
901
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
902
#if !defined(PNG_1_0_X)
903
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
904
/* && _mmx_supported */ )
911
int dummy_value_a; // fix 'forbidden register spilled' error
916
_unmask = ~mask; // global variable for -fPIC version
917
srcptr = png_ptr->row_buf + 1;
919
len = png_ptr->width &~7; // reduce to multiple of 8
920
diff = (int) (png_ptr->width & 7); // amount lost //
922
__asm__ __volatile__ (
923
"movd _unmask, %%mm7 \n\t" // load bit pattern
924
"psubb %%mm6, %%mm6 \n\t" // zero mm6
925
"punpcklbw %%mm7, %%mm7 \n\t"
926
"punpcklwd %%mm7, %%mm7 \n\t"
927
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
929
"movq _mask24_0, %%mm0 \n\t"
930
"movq _mask24_1, %%mm1 \n\t"
931
"movq _mask24_2, %%mm2 \n\t"
933
"pand %%mm7, %%mm0 \n\t"
934
"pand %%mm7, %%mm1 \n\t"
935
"pand %%mm7, %%mm2 \n\t"
937
"pcmpeqb %%mm6, %%mm0 \n\t"
938
"pcmpeqb %%mm6, %%mm1 \n\t"
939
"pcmpeqb %%mm6, %%mm2 \n\t"
941
// preload "movl len, %%ecx \n\t" // load length of line
942
// preload "movl srcptr, %%esi \n\t" // load source
943
// preload "movl dstptr, %%edi \n\t" // load dest
945
"cmpl $0, %%ecx \n\t"
946
"jz mainloop24end \n\t"
949
"movq (%%esi), %%mm4 \n\t"
950
"pand %%mm0, %%mm4 \n\t"
951
"movq %%mm0, %%mm6 \n\t"
952
"movq (%%edi), %%mm7 \n\t"
953
"pandn %%mm7, %%mm6 \n\t"
954
"por %%mm6, %%mm4 \n\t"
955
"movq %%mm4, (%%edi) \n\t"
957
"movq 8(%%esi), %%mm5 \n\t"
958
"pand %%mm1, %%mm5 \n\t"
959
"movq %%mm1, %%mm7 \n\t"
960
"movq 8(%%edi), %%mm6 \n\t"
961
"pandn %%mm6, %%mm7 \n\t"
962
"por %%mm7, %%mm5 \n\t"
963
"movq %%mm5, 8(%%edi) \n\t"
965
"movq 16(%%esi), %%mm6 \n\t"
966
"pand %%mm2, %%mm6 \n\t"
967
"movq %%mm2, %%mm4 \n\t"
968
"movq 16(%%edi), %%mm7 \n\t"
969
"pandn %%mm7, %%mm4 \n\t"
970
"por %%mm4, %%mm6 \n\t"
971
"movq %%mm6, 16(%%edi) \n\t"
973
"addl $24, %%esi \n\t" // inc by 24 bytes processed
974
"addl $24, %%edi \n\t"
975
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
979
"mainloop24end: \n\t"
980
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
981
"movl %%eax, %%ecx \n\t"
982
"cmpl $0, %%ecx \n\t"
984
// preload "movl mask, %%edx \n\t"
985
"sall $24, %%edx \n\t" // make low byte, high byte
988
"sall %%edx \n\t" // move high bit to CF
989
"jnc skip24 \n\t" // if CF = 0
990
"movw (%%esi), %%ax \n\t"
991
"movw %%ax, (%%edi) \n\t"
992
"xorl %%eax, %%eax \n\t"
993
"movb 2(%%esi), %%al \n\t"
994
"movb %%al, 2(%%edi) \n\t"
997
"addl $3, %%esi \n\t"
998
"addl $3, %%edi \n\t"
1000
"jnz secondloop24 \n\t"
1005
: "=a" (dummy_value_a), // output regs (dummy)
1006
"=d" (dummy_value_d),
1007
"=c" (dummy_value_c),
1008
"=S" (dummy_value_S),
1009
"=D" (dummy_value_D)
1011
: "3" (srcptr), // esi // input regs
1012
"4" (dstptr), // edi
1014
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1018
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1019
: "%mm0", "%mm1", "%mm2" // clobber list
1020
, "%mm4", "%mm5", "%mm6", "%mm7"
1024
else /* mmx _not supported - Use modified C routine */
1025
#endif /* PNG_MMX_CODE_SUPPORTED */
1027
register png_uint_32 i;
1028
png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1029
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1030
register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1031
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1032
register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1033
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1034
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1035
int diff = (int) (png_ptr->width & 7); /* amount lost */
1036
register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1038
srcptr = png_ptr->row_buf + 1 + initial_val;
1039
dstptr = row + initial_val;
1041
for (i = initial_val; i < final_val; i += stride)
1043
png_memcpy(dstptr, srcptr, rep_bytes);
1047
if (diff) /* number of leftover pixels: 3 for pngtest */
1049
final_val+=diff*BPP3;
1050
for (; i < final_val; i += stride)
1052
if (rep_bytes > (int)(final_val-i))
1053
rep_bytes = (int)(final_val-i);
1054
png_memcpy(dstptr, srcptr, rep_bytes);
1059
} /* end of else (_mmx_supported) */
1064
case 32: /* png_ptr->row_info.pixel_depth */
1069
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1070
#if !defined(PNG_1_0_X)
1071
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1072
/* && _mmx_supported */ )
1079
int dummy_value_a; // fix 'forbidden register spilled' error
1084
_unmask = ~mask; // global variable for -fPIC version
1085
srcptr = png_ptr->row_buf + 1;
1087
len = png_ptr->width &~7; // reduce to multiple of 8
1088
diff = (int) (png_ptr->width & 7); // amount lost //
1090
__asm__ __volatile__ (
1091
"movd _unmask, %%mm7 \n\t" // load bit pattern
1092
"psubb %%mm6, %%mm6 \n\t" // zero mm6
1093
"punpcklbw %%mm7, %%mm7 \n\t"
1094
"punpcklwd %%mm7, %%mm7 \n\t"
1095
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1097
"movq _mask32_0, %%mm0 \n\t"
1098
"movq _mask32_1, %%mm1 \n\t"
1099
"movq _mask32_2, %%mm2 \n\t"
1100
"movq _mask32_3, %%mm3 \n\t"
1102
"pand %%mm7, %%mm0 \n\t"
1103
"pand %%mm7, %%mm1 \n\t"
1104
"pand %%mm7, %%mm2 \n\t"
1105
"pand %%mm7, %%mm3 \n\t"
1107
"pcmpeqb %%mm6, %%mm0 \n\t"
1108
"pcmpeqb %%mm6, %%mm1 \n\t"
1109
"pcmpeqb %%mm6, %%mm2 \n\t"
1110
"pcmpeqb %%mm6, %%mm3 \n\t"
1112
// preload "movl len, %%ecx \n\t" // load length of line
1113
// preload "movl srcptr, %%esi \n\t" // load source
1114
// preload "movl dstptr, %%edi \n\t" // load dest
1116
"cmpl $0, %%ecx \n\t" // lcr
1117
"jz mainloop32end \n\t"
1120
"movq (%%esi), %%mm4 \n\t"
1121
"pand %%mm0, %%mm4 \n\t"
1122
"movq %%mm0, %%mm6 \n\t"
1123
"movq (%%edi), %%mm7 \n\t"
1124
"pandn %%mm7, %%mm6 \n\t"
1125
"por %%mm6, %%mm4 \n\t"
1126
"movq %%mm4, (%%edi) \n\t"
1128
"movq 8(%%esi), %%mm5 \n\t"
1129
"pand %%mm1, %%mm5 \n\t"
1130
"movq %%mm1, %%mm7 \n\t"
1131
"movq 8(%%edi), %%mm6 \n\t"
1132
"pandn %%mm6, %%mm7 \n\t"
1133
"por %%mm7, %%mm5 \n\t"
1134
"movq %%mm5, 8(%%edi) \n\t"
1136
"movq 16(%%esi), %%mm6 \n\t"
1137
"pand %%mm2, %%mm6 \n\t"
1138
"movq %%mm2, %%mm4 \n\t"
1139
"movq 16(%%edi), %%mm7 \n\t"
1140
"pandn %%mm7, %%mm4 \n\t"
1141
"por %%mm4, %%mm6 \n\t"
1142
"movq %%mm6, 16(%%edi) \n\t"
1144
"movq 24(%%esi), %%mm7 \n\t"
1145
"pand %%mm3, %%mm7 \n\t"
1146
"movq %%mm3, %%mm5 \n\t"
1147
"movq 24(%%edi), %%mm4 \n\t"
1148
"pandn %%mm4, %%mm5 \n\t"
1149
"por %%mm5, %%mm7 \n\t"
1150
"movq %%mm7, 24(%%edi) \n\t"
1152
"addl $32, %%esi \n\t" // inc by 32 bytes processed
1153
"addl $32, %%edi \n\t"
1154
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
1155
"ja mainloop32 \n\t"
1157
"mainloop32end: \n\t"
1158
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1159
"movl %%eax, %%ecx \n\t"
1160
"cmpl $0, %%ecx \n\t"
1162
// preload "movl mask, %%edx \n\t"
1163
"sall $24, %%edx \n\t" // low byte => high byte
1165
"secondloop32: \n\t"
1166
"sall %%edx \n\t" // move high bit to CF
1167
"jnc skip32 \n\t" // if CF = 0
1168
"movl (%%esi), %%eax \n\t"
1169
"movl %%eax, (%%edi) \n\t"
1172
"addl $4, %%esi \n\t"
1173
"addl $4, %%edi \n\t"
1175
"jnz secondloop32 \n\t"
1180
: "=a" (dummy_value_a), // output regs (dummy)
1181
"=d" (dummy_value_d),
1182
"=c" (dummy_value_c),
1183
"=S" (dummy_value_S),
1184
"=D" (dummy_value_D)
1186
: "3" (srcptr), // esi // input regs
1187
"4" (dstptr), // edi
1189
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1193
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1194
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1195
, "%mm4", "%mm5", "%mm6", "%mm7"
1199
else /* mmx _not supported - Use modified C routine */
1200
#endif /* PNG_MMX_CODE_SUPPORTED */
1202
register png_uint_32 i;
1203
png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1204
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1205
register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1206
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1207
register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1208
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1209
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1210
int diff = (int) (png_ptr->width & 7); /* amount lost */
1211
register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1213
srcptr = png_ptr->row_buf + 1 + initial_val;
1214
dstptr = row + initial_val;
1216
for (i = initial_val; i < final_val; i += stride)
1218
png_memcpy(dstptr, srcptr, rep_bytes);
1222
if (diff) /* number of leftover pixels: 3 for pngtest */
1224
final_val+=diff*BPP4;
1225
for (; i < final_val; i += stride)
1227
if (rep_bytes > (int)(final_val-i))
1228
rep_bytes = (int)(final_val-i);
1229
png_memcpy(dstptr, srcptr, rep_bytes);
1234
} /* end of else (_mmx_supported) */
1239
case 48: /* png_ptr->row_info.pixel_depth */
1244
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1245
#if !defined(PNG_1_0_X)
1246
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1247
/* && _mmx_supported */ )
1254
int dummy_value_a; // fix 'forbidden register spilled' error
1259
_unmask = ~mask; // global variable for -fPIC version
1260
srcptr = png_ptr->row_buf + 1;
1262
len = png_ptr->width &~7; // reduce to multiple of 8
1263
diff = (int) (png_ptr->width & 7); // amount lost //
1265
__asm__ __volatile__ (
1266
"movd _unmask, %%mm7 \n\t" // load bit pattern
1267
"psubb %%mm6, %%mm6 \n\t" // zero mm6
1268
"punpcklbw %%mm7, %%mm7 \n\t"
1269
"punpcklwd %%mm7, %%mm7 \n\t"
1270
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1272
"movq _mask48_0, %%mm0 \n\t"
1273
"movq _mask48_1, %%mm1 \n\t"
1274
"movq _mask48_2, %%mm2 \n\t"
1275
"movq _mask48_3, %%mm3 \n\t"
1276
"movq _mask48_4, %%mm4 \n\t"
1277
"movq _mask48_5, %%mm5 \n\t"
1279
"pand %%mm7, %%mm0 \n\t"
1280
"pand %%mm7, %%mm1 \n\t"
1281
"pand %%mm7, %%mm2 \n\t"
1282
"pand %%mm7, %%mm3 \n\t"
1283
"pand %%mm7, %%mm4 \n\t"
1284
"pand %%mm7, %%mm5 \n\t"
1286
"pcmpeqb %%mm6, %%mm0 \n\t"
1287
"pcmpeqb %%mm6, %%mm1 \n\t"
1288
"pcmpeqb %%mm6, %%mm2 \n\t"
1289
"pcmpeqb %%mm6, %%mm3 \n\t"
1290
"pcmpeqb %%mm6, %%mm4 \n\t"
1291
"pcmpeqb %%mm6, %%mm5 \n\t"
1293
// preload "movl len, %%ecx \n\t" // load length of line
1294
// preload "movl srcptr, %%esi \n\t" // load source
1295
// preload "movl dstptr, %%edi \n\t" // load dest
1297
"cmpl $0, %%ecx \n\t"
1298
"jz mainloop48end \n\t"
1301
"movq (%%esi), %%mm7 \n\t"
1302
"pand %%mm0, %%mm7 \n\t"
1303
"movq %%mm0, %%mm6 \n\t"
1304
"pandn (%%edi), %%mm6 \n\t"
1305
"por %%mm6, %%mm7 \n\t"
1306
"movq %%mm7, (%%edi) \n\t"
1308
"movq 8(%%esi), %%mm6 \n\t"
1309
"pand %%mm1, %%mm6 \n\t"
1310
"movq %%mm1, %%mm7 \n\t"
1311
"pandn 8(%%edi), %%mm7 \n\t"
1312
"por %%mm7, %%mm6 \n\t"
1313
"movq %%mm6, 8(%%edi) \n\t"
1315
"movq 16(%%esi), %%mm6 \n\t"
1316
"pand %%mm2, %%mm6 \n\t"
1317
"movq %%mm2, %%mm7 \n\t"
1318
"pandn 16(%%edi), %%mm7 \n\t"
1319
"por %%mm7, %%mm6 \n\t"
1320
"movq %%mm6, 16(%%edi) \n\t"
1322
"movq 24(%%esi), %%mm7 \n\t"
1323
"pand %%mm3, %%mm7 \n\t"
1324
"movq %%mm3, %%mm6 \n\t"
1325
"pandn 24(%%edi), %%mm6 \n\t"
1326
"por %%mm6, %%mm7 \n\t"
1327
"movq %%mm7, 24(%%edi) \n\t"
1329
"movq 32(%%esi), %%mm6 \n\t"
1330
"pand %%mm4, %%mm6 \n\t"
1331
"movq %%mm4, %%mm7 \n\t"
1332
"pandn 32(%%edi), %%mm7 \n\t"
1333
"por %%mm7, %%mm6 \n\t"
1334
"movq %%mm6, 32(%%edi) \n\t"
1336
"movq 40(%%esi), %%mm7 \n\t"
1337
"pand %%mm5, %%mm7 \n\t"
1338
"movq %%mm5, %%mm6 \n\t"
1339
"pandn 40(%%edi), %%mm6 \n\t"
1340
"por %%mm6, %%mm7 \n\t"
1341
"movq %%mm7, 40(%%edi) \n\t"
1343
"addl $48, %%esi \n\t" // inc by 48 bytes processed
1344
"addl $48, %%edi \n\t"
1345
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
1347
"ja mainloop48 \n\t"
1349
"mainloop48end: \n\t"
1350
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1351
"movl %%eax, %%ecx \n\t"
1352
"cmpl $0, %%ecx \n\t"
1354
// preload "movl mask, %%edx \n\t"
1355
"sall $24, %%edx \n\t" // make low byte, high byte
1357
"secondloop48: \n\t"
1358
"sall %%edx \n\t" // move high bit to CF
1359
"jnc skip48 \n\t" // if CF = 0
1360
"movl (%%esi), %%eax \n\t"
1361
"movl %%eax, (%%edi) \n\t"
1364
"addl $4, %%esi \n\t"
1365
"addl $4, %%edi \n\t"
1367
"jnz secondloop48 \n\t"
1372
: "=a" (dummy_value_a), // output regs (dummy)
1373
"=d" (dummy_value_d),
1374
"=c" (dummy_value_c),
1375
"=S" (dummy_value_S),
1376
"=D" (dummy_value_D)
1378
: "3" (srcptr), // esi // input regs
1379
"4" (dstptr), // edi
1381
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1385
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1386
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1387
, "%mm4", "%mm5", "%mm6", "%mm7"
1391
else /* mmx _not supported - Use modified C routine */
1392
#endif /* PNG_MMX_CODE_SUPPORTED */
1394
register png_uint_32 i;
1395
png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1396
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1397
register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1398
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1399
register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1400
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1401
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1402
int diff = (int) (png_ptr->width & 7); /* amount lost */
1403
register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1405
srcptr = png_ptr->row_buf + 1 + initial_val;
1406
dstptr = row + initial_val;
1408
for (i = initial_val; i < final_val; i += stride)
1410
png_memcpy(dstptr, srcptr, rep_bytes);
1414
if (diff) /* number of leftover pixels: 3 for pngtest */
1416
final_val+=diff*BPP6;
1417
for (; i < final_val; i += stride)
1419
if (rep_bytes > (int)(final_val-i))
1420
rep_bytes = (int)(final_val-i);
1421
png_memcpy(dstptr, srcptr, rep_bytes);
1426
} /* end of else (_mmx_supported) */
1431
case 64: /* png_ptr->row_info.pixel_depth */
1435
register png_uint_32 i;
1436
png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1437
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1438
register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1439
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1440
register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1441
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1442
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1443
int diff = (int) (png_ptr->width & 7); /* amount lost */
1444
register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1446
srcptr = png_ptr->row_buf + 1 + initial_val;
1447
dstptr = row + initial_val;
1449
for (i = initial_val; i < final_val; i += stride)
1451
png_memcpy(dstptr, srcptr, rep_bytes);
1455
if (diff) /* number of leftover pixels: 3 for pngtest */
1457
final_val+=diff*BPP8;
1458
for (; i < final_val; i += stride)
1460
if (rep_bytes > (int)(final_val-i))
1461
rep_bytes = (int)(final_val-i);
1462
png_memcpy(dstptr, srcptr, rep_bytes);
1471
default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1473
/* this should never happen */
1474
png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1477
} /* end switch (png_ptr->row_info.pixel_depth) */
1479
} /* end if (non-trivial mask) */
1481
} /* end png_combine_row() */
1483
#endif /* PNG_HAVE_MMX_COMBINE_ROW */
/*===========================================================================*/
/*                                                                           */
/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
/*                                                                           */
/*===========================================================================*/
1494
#if defined(PNG_READ_INTERLACING_SUPPORTED)
1495
#if defined(PNG_HAVE_MMX_READ_INTERLACE)
/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
 * has taken place.  [GRR: what other steps come before and/or after?]
 */
png_do_read_interlace(png_structp png_ptr)
1504
png_row_infop row_info = &(png_ptr->row_info);
1505
png_bytep row = png_ptr->row_buf + 1;
1506
int pass = png_ptr->pass;
1507
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1508
png_uint_32 transformations = png_ptr->transformations;
1511
png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1513
#if defined(PNG_MMX_CODE_SUPPORTED)
1514
if (_mmx_supported == 2) {
1515
#if !defined(PNG_1_0_X)
1516
/* this should have happened in png_init_mmx_flags() already */
1517
png_warning(png_ptr, "asm_flags may not have been initialized");
1523
if (row != NULL && row_info != NULL)
1525
png_uint_32 final_width;
1527
final_width = row_info->width * png_pass_inc[pass];
1529
switch (row_info->pixel_depth)
1535
int s_start, s_end, s_inc;
1540
sp = row + (png_size_t)((row_info->width - 1) >> 3);
1541
dp = row + (png_size_t)((final_width - 1) >> 3);
1542
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1543
if (transformations & PNG_PACKSWAP)
1545
sshift = (int)((row_info->width + 7) & 7);
1546
dshift = (int)((final_width + 7) & 7);
1554
sshift = 7 - (int)((row_info->width + 7) & 7);
1555
dshift = 7 - (int)((final_width + 7) & 7);
1561
for (i = row_info->width; i; i--)
1563
v = (png_byte)((*sp >> sshift) & 0x1);
1564
for (j = 0; j < png_pass_inc[pass]; j++)
1566
*dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1567
*dp |= (png_byte)(v << dshift);
1568
if (dshift == s_end)
1576
if (sshift == s_end)
1591
int s_start, s_end, s_inc;
1594
sp = row + (png_size_t)((row_info->width - 1) >> 2);
1595
dp = row + (png_size_t)((final_width - 1) >> 2);
1596
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1597
if (transformations & PNG_PACKSWAP)
1599
sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1600
dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1608
sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1609
dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1615
for (i = row_info->width; i; i--)
1620
v = (png_byte)((*sp >> sshift) & 0x3);
1621
for (j = 0; j < png_pass_inc[pass]; j++)
1623
*dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1624
*dp |= (png_byte)(v << dshift);
1625
if (dshift == s_end)
1633
if (sshift == s_end)
1648
int s_start, s_end, s_inc;
1651
sp = row + (png_size_t)((row_info->width - 1) >> 1);
1652
dp = row + (png_size_t)((final_width - 1) >> 1);
1653
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1654
if (transformations & PNG_PACKSWAP)
1656
sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1657
dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1665
sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1666
dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1672
for (i = row_info->width; i; i--)
1677
v = (png_byte)((*sp >> sshift) & 0xf);
1678
for (j = 0; j < png_pass_inc[pass]; j++)
1680
*dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1681
*dp |= (png_byte)(v << dshift);
1682
if (dshift == s_end)
1690
if (sshift == s_end)
1701
/*====================================================================*/
1703
default: /* 8-bit or larger (this is where the routine is modified) */
1706
// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1707
// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1708
// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1709
// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1713
png_size_t pixel_bytes;
1714
int width = (int)row_info->width;
1716
pixel_bytes = (row_info->pixel_depth >> 3);
1718
/* point sptr at the last pixel in the pre-expanded row: */
1719
sptr = row + (width - 1) * pixel_bytes;
1721
/* point dp at the last pixel position in the expanded row: */
1722
dp = row + (final_width - 1) * pixel_bytes;
1724
/* New code by Nirav Chhatrapati - Intel Corporation */
1726
#if defined(PNG_MMX_CODE_SUPPORTED)
1727
#if !defined(PNG_1_0_X)
1728
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1729
/* && _mmx_supported */ )
1734
//--------------------------------------------------------------
1735
if (pixel_bytes == 3)
1737
if (((pass == 0) || (pass == 1)) && width)
1739
int dummy_value_c; // fix 'forbidden register spilled'
1744
__asm__ __volatile__ (
1745
"subl $21, %%edi \n\t"
1746
// (png_pass_inc[pass] - 1)*pixel_bytes
1748
".loop3_pass0: \n\t"
1749
"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1750
"pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1751
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1752
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1753
"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1754
"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1755
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1756
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1757
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1758
"movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1759
"psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1760
"movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1761
"punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1762
"movq %%mm4, 16(%%edi) \n\t"
1763
"psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1764
"movq %%mm3, 8(%%edi) \n\t"
1765
"punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1766
"subl $3, %%esi \n\t"
1767
"movq %%mm0, (%%edi) \n\t"
1768
"subl $24, %%edi \n\t"
1770
"jnz .loop3_pass0 \n\t"
1773
: "=c" (dummy_value_c), // output regs (dummy)
1774
"=S" (dummy_value_S),
1775
"=D" (dummy_value_D),
1776
"=a" (dummy_value_a)
1779
: "1" (sptr), // esi // input regs
1782
"3" (&_const4) // %1(?) (0x0000000000FFFFFFLL)
1784
#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1785
: "%mm0", "%mm1", "%mm2" // clobber list
1790
else if (((pass == 2) || (pass == 3)) && width)
1792
int dummy_value_c; // fix 'forbidden register spilled'
1797
__asm__ __volatile__ (
1798
"subl $9, %%edi \n\t"
1799
// (png_pass_inc[pass] - 1)*pixel_bytes
1801
".loop3_pass2: \n\t"
1802
"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1803
"pand (%3), %%mm0 \n\t" // z z z z z 2 1 0
1804
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1805
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1806
"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1807
"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1808
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1809
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1810
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1811
"movq %%mm0, 4(%%edi) \n\t"
1812
"psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1813
"subl $3, %%esi \n\t"
1814
"movd %%mm0, (%%edi) \n\t"
1815
"subl $12, %%edi \n\t"
1817
"jnz .loop3_pass2 \n\t"
1820
: "=c" (dummy_value_c), // output regs (dummy)
1821
"=S" (dummy_value_S),
1822
"=D" (dummy_value_D),
1823
"=a" (dummy_value_a)
1825
: "1" (sptr), // esi // input regs
1828
"3" (&_const4) // (0x0000000000FFFFFFLL)
1830
#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1831
: "%mm0", "%mm1", "%mm2" // clobber list
1835
else if (width) /* && ((pass == 4) || (pass == 5)) */
1837
int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1840
width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1843
// png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1844
// sptr points at last pixel in pre-expanded row
1845
// dp points at last pixel position in expanded row
1846
int dummy_value_c; // fix 'forbidden register spilled'
1852
__asm__ __volatile__ (
1853
"subl $3, %%esi \n\t"
1854
"subl $9, %%edi \n\t"
1855
// (png_pass_inc[pass] + 1)*pixel_bytes
1857
".loop3_pass4: \n\t"
1858
"movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1859
"movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1860
"movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1861
"psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1862
"pand (%3), %%mm1 \n\t" // z z z z z 2 1 0
1863
"psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1864
"por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1865
"movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1866
"psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1867
"movq %%mm0, (%%edi) \n\t"
1868
"psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1869
"pand (%4), %%mm3 \n\t" // z z z z z z z 5
1870
"por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1871
"subl $6, %%esi \n\t"
1872
"movd %%mm2, 8(%%edi) \n\t"
1873
"subl $12, %%edi \n\t"
1874
"subl $2, %%ecx \n\t"
1875
"jnz .loop3_pass4 \n\t"
1878
: "=c" (dummy_value_c), // output regs (dummy)
1879
"=S" (dummy_value_S),
1880
"=D" (dummy_value_D),
1881
"=a" (dummy_value_a),
1882
"=d" (dummy_value_d)
1884
: "1" (sptr), // esi // input regs
1886
"0" (width_mmx), // ecx
1887
"3" (&_const4), // 0x0000000000FFFFFFLL
1888
"4" (&_const6) // 0x00000000000000FFLL
1890
#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1891
: "%mm0", "%mm1" // clobber list
1897
sptr -= width_mmx*3;
1899
for (i = width; i; i--)
1904
png_memcpy(v, sptr, 3);
1905
for (j = 0; j < png_pass_inc[pass]; j++)
1907
png_memcpy(dp, v, 3);
1913
} /* end of pixel_bytes == 3 */
1915
//--------------------------------------------------------------
1916
else if (pixel_bytes == 1)
1918
if (((pass == 0) || (pass == 1)) && width)
1920
int width_mmx = ((width >> 2) << 2);
1921
width -= width_mmx; // 0-3 pixels => 0-3 bytes
1924
int dummy_value_c; // fix 'forbidden register spilled'
1928
__asm__ __volatile__ (
1929
"subl $3, %%esi \n\t"
1930
"subl $31, %%edi \n\t"
1932
".loop1_pass0: \n\t"
1933
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1934
"movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1935
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1936
"movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1937
"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1938
"movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1939
"punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1940
"punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1941
"movq %%mm0, (%%edi) \n\t"
1942
"punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1943
"movq %%mm3, 8(%%edi) \n\t"
1944
"movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1945
"punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1946
"punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1947
"movq %%mm2, 16(%%edi) \n\t"
1948
"subl $4, %%esi \n\t"
1949
"movq %%mm4, 24(%%edi) \n\t"
1950
"subl $32, %%edi \n\t"
1951
"subl $4, %%ecx \n\t"
1952
"jnz .loop1_pass0 \n\t"
1955
: "=c" (dummy_value_c), // output regs (dummy)
1956
"=S" (dummy_value_S),
1957
"=D" (dummy_value_D)
1959
: "1" (sptr), // esi // input regs
1961
"0" (width_mmx) // ecx
1963
#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1964
: "%mm0", "%mm1", "%mm2" // clobber list
1972
for (i = width; i; i--)
1976
/* I simplified this part in version 1.0.4e
1977
* here and in several other instances where
1978
* pixel_bytes == 1 -- GR-P
1983
* png_memcpy(v, sptr, pixel_bytes);
1984
* for (j = 0; j < png_pass_inc[pass]; j++)
1986
* png_memcpy(dp, v, pixel_bytes);
1987
* dp -= pixel_bytes;
1989
* sptr -= pixel_bytes;
1991
* Replacement code is in the next three lines:
1994
for (j = 0; j < png_pass_inc[pass]; j++)
2001
else if (((pass == 2) || (pass == 3)) && width)
2003
int width_mmx = ((width >> 2) << 2);
2004
width -= width_mmx; // 0-3 pixels => 0-3 bytes
2007
int dummy_value_c; // fix 'forbidden register spilled'
2011
__asm__ __volatile__ (
2012
"subl $3, %%esi \n\t"
2013
"subl $15, %%edi \n\t"
2015
".loop1_pass2: \n\t"
2016
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2017
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2018
"movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2019
"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2020
"punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2021
"movq %%mm0, (%%edi) \n\t"
2022
"subl $4, %%esi \n\t"
2023
"movq %%mm1, 8(%%edi) \n\t"
2024
"subl $16, %%edi \n\t"
2025
"subl $4, %%ecx \n\t"
2026
"jnz .loop1_pass2 \n\t"
2029
: "=c" (dummy_value_c), // output regs (dummy)
2030
"=S" (dummy_value_S),
2031
"=D" (dummy_value_D)
2033
: "1" (sptr), // esi // input regs
2035
"0" (width_mmx) // ecx
2037
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2038
: "%mm0", "%mm1" // clobber list
2045
for (i = width; i; i--)
2049
for (j = 0; j < png_pass_inc[pass]; j++)
2056
else if (width) /* && ((pass == 4) || (pass == 5)) */
2058
int width_mmx = ((width >> 3) << 3);
2059
width -= width_mmx; // 0-3 pixels => 0-3 bytes
2062
int dummy_value_c; // fix 'forbidden register spilled'
2066
__asm__ __volatile__ (
2067
"subl $7, %%esi \n\t"
2068
"subl $15, %%edi \n\t"
2070
".loop1_pass4: \n\t"
2071
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2072
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2073
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2074
"punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2075
"movq %%mm1, 8(%%edi) \n\t"
2076
"subl $8, %%esi \n\t"
2077
"movq %%mm0, (%%edi) \n\t"
2078
"subl $16, %%edi \n\t"
2079
"subl $8, %%ecx \n\t"
2080
"jnz .loop1_pass4 \n\t"
2083
: "=c" (dummy_value_c), // output regs (none)
2084
"=S" (dummy_value_S),
2085
"=D" (dummy_value_D)
2087
: "1" (sptr), // esi // input regs
2089
"0" (width_mmx) // ecx
2091
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2092
: "%mm0", "%mm1" // clobber list
2099
for (i = width; i; i--)
2103
for (j = 0; j < png_pass_inc[pass]; j++)
2110
} /* end of pixel_bytes == 1 */
2112
//--------------------------------------------------------------
2113
else if (pixel_bytes == 2)
2115
if (((pass == 0) || (pass == 1)) && width)
2117
int width_mmx = ((width >> 1) << 1);
2118
width -= width_mmx; // 0,1 pixels => 0,2 bytes
2121
int dummy_value_c; // fix 'forbidden register spilled'
2125
__asm__ __volatile__ (
2126
"subl $2, %%esi \n\t"
2127
"subl $30, %%edi \n\t"
2129
".loop2_pass0: \n\t"
2130
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2131
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2132
"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2133
"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2134
"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2135
"movq %%mm0, (%%edi) \n\t"
2136
"movq %%mm0, 8(%%edi) \n\t"
2137
"movq %%mm1, 16(%%edi) \n\t"
2138
"subl $4, %%esi \n\t"
2139
"movq %%mm1, 24(%%edi) \n\t"
2140
"subl $32, %%edi \n\t"
2141
"subl $2, %%ecx \n\t"
2142
"jnz .loop2_pass0 \n\t"
2145
: "=c" (dummy_value_c), // output regs (dummy)
2146
"=S" (dummy_value_S),
2147
"=D" (dummy_value_D)
2149
: "1" (sptr), // esi // input regs
2151
"0" (width_mmx) // ecx
2153
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2154
: "%mm0", "%mm1" // clobber list
2159
sptr -= (width_mmx*2 - 2); // sign fixed
2160
dp -= (width_mmx*16 - 2); // sign fixed
2161
for (i = width; i; i--)
2166
png_memcpy(v, sptr, 2);
2167
for (j = 0; j < png_pass_inc[pass]; j++)
2170
png_memcpy(dp, v, 2);
2174
else if (((pass == 2) || (pass == 3)) && width)
2176
int width_mmx = ((width >> 1) << 1) ;
2177
width -= width_mmx; // 0,1 pixels => 0,2 bytes
2180
int dummy_value_c; // fix 'forbidden register spilled'
2184
__asm__ __volatile__ (
2185
"subl $2, %%esi \n\t"
2186
"subl $14, %%edi \n\t"
2188
".loop2_pass2: \n\t"
2189
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2190
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2191
"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2192
"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2193
"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2194
"movq %%mm0, (%%edi) \n\t"
2195
"subl $4, %%esi \n\t"
2196
"movq %%mm1, 8(%%edi) \n\t"
2197
"subl $16, %%edi \n\t"
2198
"subl $2, %%ecx \n\t"
2199
"jnz .loop2_pass2 \n\t"
2202
: "=c" (dummy_value_c), // output regs (dummy)
2203
"=S" (dummy_value_S),
2204
"=D" (dummy_value_D)
2206
: "1" (sptr), // esi // input regs
2208
"0" (width_mmx) // ecx
2210
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2211
: "%mm0", "%mm1" // clobber list
2216
sptr -= (width_mmx*2 - 2); // sign fixed
2217
dp -= (width_mmx*8 - 2); // sign fixed
2218
for (i = width; i; i--)
2223
png_memcpy(v, sptr, 2);
2224
for (j = 0; j < png_pass_inc[pass]; j++)
2227
png_memcpy(dp, v, 2);
2231
else if (width) // pass == 4 or 5
2233
int width_mmx = ((width >> 1) << 1) ;
2234
width -= width_mmx; // 0,1 pixels => 0,2 bytes
2237
int dummy_value_c; // fix 'forbidden register spilled'
2241
__asm__ __volatile__ (
2242
"subl $2, %%esi \n\t"
2243
"subl $6, %%edi \n\t"
2245
".loop2_pass4: \n\t"
2246
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2247
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2248
"subl $4, %%esi \n\t"
2249
"movq %%mm0, (%%edi) \n\t"
2250
"subl $8, %%edi \n\t"
2251
"subl $2, %%ecx \n\t"
2252
"jnz .loop2_pass4 \n\t"
2255
: "=c" (dummy_value_c), // output regs (dummy)
2256
"=S" (dummy_value_S),
2257
"=D" (dummy_value_D)
2259
: "1" (sptr), // esi // input regs
2261
"0" (width_mmx) // ecx
2263
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2264
: "%mm0" // clobber list
2269
sptr -= (width_mmx*2 - 2); // sign fixed
2270
dp -= (width_mmx*4 - 2); // sign fixed
2271
for (i = width; i; i--)
2276
png_memcpy(v, sptr, 2);
2277
for (j = 0; j < png_pass_inc[pass]; j++)
2280
png_memcpy(dp, v, 2);
2284
} /* end of pixel_bytes == 2 */
2286
//--------------------------------------------------------------
2287
else if (pixel_bytes == 4)
2289
if (((pass == 0) || (pass == 1)) && width)
2291
int width_mmx = ((width >> 1) << 1);
2292
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2295
int dummy_value_c; // fix 'forbidden register spilled'
2299
__asm__ __volatile__ (
2300
"subl $4, %%esi \n\t"
2301
"subl $60, %%edi \n\t"
2303
".loop4_pass0: \n\t"
2304
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2305
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2306
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2307
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2308
"movq %%mm0, (%%edi) \n\t"
2309
"movq %%mm0, 8(%%edi) \n\t"
2310
"movq %%mm0, 16(%%edi) \n\t"
2311
"movq %%mm0, 24(%%edi) \n\t"
2312
"movq %%mm1, 32(%%edi) \n\t"
2313
"movq %%mm1, 40(%%edi) \n\t"
2314
"movq %%mm1, 48(%%edi) \n\t"
2315
"subl $8, %%esi \n\t"
2316
"movq %%mm1, 56(%%edi) \n\t"
2317
"subl $64, %%edi \n\t"
2318
"subl $2, %%ecx \n\t"
2319
"jnz .loop4_pass0 \n\t"
2322
: "=c" (dummy_value_c), // output regs (dummy)
2323
"=S" (dummy_value_S),
2324
"=D" (dummy_value_D)
2326
: "1" (sptr), // esi // input regs
2328
"0" (width_mmx) // ecx
2330
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2331
: "%mm0", "%mm1" // clobber list
2336
sptr -= (width_mmx*4 - 4); // sign fixed
2337
dp -= (width_mmx*32 - 4); // sign fixed
2338
for (i = width; i; i--)
2343
png_memcpy(v, sptr, 4);
2344
for (j = 0; j < png_pass_inc[pass]; j++)
2347
png_memcpy(dp, v, 4);
2351
else if (((pass == 2) || (pass == 3)) && width)
2353
int width_mmx = ((width >> 1) << 1);
2354
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2357
int dummy_value_c; // fix 'forbidden register spilled'
2361
__asm__ __volatile__ (
2362
"subl $4, %%esi \n\t"
2363
"subl $28, %%edi \n\t"
2365
".loop4_pass2: \n\t"
2366
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2367
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2368
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2369
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2370
"movq %%mm0, (%%edi) \n\t"
2371
"movq %%mm0, 8(%%edi) \n\t"
2372
"movq %%mm1, 16(%%edi) \n\t"
2373
"movq %%mm1, 24(%%edi) \n\t"
2374
"subl $8, %%esi \n\t"
2375
"subl $32, %%edi \n\t"
2376
"subl $2, %%ecx \n\t"
2377
"jnz .loop4_pass2 \n\t"
2380
: "=c" (dummy_value_c), // output regs (dummy)
2381
"=S" (dummy_value_S),
2382
"=D" (dummy_value_D)
2384
: "1" (sptr), // esi // input regs
2386
"0" (width_mmx) // ecx
2388
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2389
: "%mm0", "%mm1" // clobber list
2394
sptr -= (width_mmx*4 - 4); // sign fixed
2395
dp -= (width_mmx*16 - 4); // sign fixed
2396
for (i = width; i; i--)
2401
png_memcpy(v, sptr, 4);
2402
for (j = 0; j < png_pass_inc[pass]; j++)
2405
png_memcpy(dp, v, 4);
2409
else if (width) // pass == 4 or 5
2411
int width_mmx = ((width >> 1) << 1) ;
2412
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2415
int dummy_value_c; // fix 'forbidden register spilled'
2419
__asm__ __volatile__ (
2420
"subl $4, %%esi \n\t"
2421
"subl $12, %%edi \n\t"
2423
".loop4_pass4: \n\t"
2424
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2425
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2426
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2427
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2428
"movq %%mm0, (%%edi) \n\t"
2429
"subl $8, %%esi \n\t"
2430
"movq %%mm1, 8(%%edi) \n\t"
2431
"subl $16, %%edi \n\t"
2432
"subl $2, %%ecx \n\t"
2433
"jnz .loop4_pass4 \n\t"
2436
: "=c" (dummy_value_c), // output regs (dummy)
2437
"=S" (dummy_value_S),
2438
"=D" (dummy_value_D)
2440
: "1" (sptr), // esi // input regs
2442
"0" (width_mmx) // ecx
2444
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2445
: "%mm0", "%mm1" // clobber list
2450
sptr -= (width_mmx*4 - 4); // sign fixed
2451
dp -= (width_mmx*8 - 4); // sign fixed
2452
for (i = width; i; i--)
2457
png_memcpy(v, sptr, 4);
2458
for (j = 0; j < png_pass_inc[pass]; j++)
2461
png_memcpy(dp, v, 4);
2465
} /* end of pixel_bytes == 4 */
2467
//--------------------------------------------------------------
2468
else if (pixel_bytes == 8)
2470
// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2471
// GRR NOTE: no need to combine passes here!
2472
if (((pass == 0) || (pass == 1)) && width)
2474
int dummy_value_c; // fix 'forbidden register spilled'
2478
// source is 8-byte RRGGBBAA
2479
// dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2480
__asm__ __volatile__ (
2481
"subl $56, %%edi \n\t" // start of last block
2483
".loop8_pass0: \n\t"
2484
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2485
"movq %%mm0, (%%edi) \n\t"
2486
"movq %%mm0, 8(%%edi) \n\t"
2487
"movq %%mm0, 16(%%edi) \n\t"
2488
"movq %%mm0, 24(%%edi) \n\t"
2489
"movq %%mm0, 32(%%edi) \n\t"
2490
"movq %%mm0, 40(%%edi) \n\t"
2491
"movq %%mm0, 48(%%edi) \n\t"
2492
"subl $8, %%esi \n\t"
2493
"movq %%mm0, 56(%%edi) \n\t"
2494
"subl $64, %%edi \n\t"
2496
"jnz .loop8_pass0 \n\t"
2499
: "=c" (dummy_value_c), // output regs (dummy)
2500
"=S" (dummy_value_S),
2501
"=D" (dummy_value_D)
2503
: "1" (sptr), // esi // input regs
2507
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2508
: "%mm0" // clobber list
2512
else if (((pass == 2) || (pass == 3)) && width)
2514
// source is 8-byte RRGGBBAA
2515
// dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2516
// (recall that expansion is _in place_: sptr and dp
2517
// both point at locations within same row buffer)
2519
int dummy_value_c; // fix 'forbidden register spilled'
2523
__asm__ __volatile__ (
2524
"subl $24, %%edi \n\t" // start of last block
2526
".loop8_pass2: \n\t"
2527
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2528
"movq %%mm0, (%%edi) \n\t"
2529
"movq %%mm0, 8(%%edi) \n\t"
2530
"movq %%mm0, 16(%%edi) \n\t"
2531
"subl $8, %%esi \n\t"
2532
"movq %%mm0, 24(%%edi) \n\t"
2533
"subl $32, %%edi \n\t"
2535
"jnz .loop8_pass2 \n\t"
2538
: "=c" (dummy_value_c), // output regs (dummy)
2539
"=S" (dummy_value_S),
2540
"=D" (dummy_value_D)
2542
: "1" (sptr), // esi // input regs
2546
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2547
: "%mm0" // clobber list
2552
else if (width) // pass == 4 or 5
2554
// source is 8-byte RRGGBBAA
2555
// dest is 16-byte RRGGBBAA RRGGBBAA
2557
int dummy_value_c; // fix 'forbidden register spilled'
2561
__asm__ __volatile__ (
2562
"subl $8, %%edi \n\t" // start of last block
2564
".loop8_pass4: \n\t"
2565
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2566
"movq %%mm0, (%%edi) \n\t"
2567
"subl $8, %%esi \n\t"
2568
"movq %%mm0, 8(%%edi) \n\t"
2569
"subl $16, %%edi \n\t"
2571
"jnz .loop8_pass4 \n\t"
2574
: "=c" (dummy_value_c), // output regs (dummy)
2575
"=S" (dummy_value_S),
2576
"=D" (dummy_value_D)
2578
: "1" (sptr), // esi // input regs
2582
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2583
: "%mm0" // clobber list
2589
} /* end of pixel_bytes == 8 */
2591
//--------------------------------------------------------------
2592
else if (pixel_bytes == 6)
2594
for (i = width; i; i--)
2598
png_memcpy(v, sptr, 6);
2599
for (j = 0; j < png_pass_inc[pass]; j++)
2601
png_memcpy(dp, v, 6);
2606
} /* end of pixel_bytes == 6 */
2608
//--------------------------------------------------------------
2611
for (i = width; i; i--)
2615
png_memcpy(v, sptr, pixel_bytes);
2616
for (j = 0; j < png_pass_inc[pass]; j++)
2618
png_memcpy(dp, v, pixel_bytes);
2624
} // end of _mmx_supported ========================================
2626
else /* MMX not supported: use modified C code - takes advantage
2627
* of inlining of png_memcpy for a constant */
2628
/* GRR 19991007: does it? or should pixel_bytes in each
2629
* block be replaced with immediate value (e.g., 1)? */
2630
/* GRR 19991017: replaced with constants in each case */
2631
#endif /* PNG_MMX_CODE_SUPPORTED */
2633
if (pixel_bytes == 1)
2635
for (i = width; i; i--)
2638
for (j = 0; j < png_pass_inc[pass]; j++)
2645
else if (pixel_bytes == 3)
2647
for (i = width; i; i--)
2651
png_memcpy(v, sptr, 3);
2652
for (j = 0; j < png_pass_inc[pass]; j++)
2654
png_memcpy(dp, v, 3);
2660
else if (pixel_bytes == 2)
2662
for (i = width; i; i--)
2666
png_memcpy(v, sptr, 2);
2667
for (j = 0; j < png_pass_inc[pass]; j++)
2669
png_memcpy(dp, v, 2);
2675
else if (pixel_bytes == 4)
2677
for (i = width; i; i--)
2681
png_memcpy(v, sptr, 4);
2682
for (j = 0; j < png_pass_inc[pass]; j++)
2685
if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2687
printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2688
row, dp, row+png_ptr->row_buf_size);
2689
printf("row_buf=%d\n",png_ptr->row_buf_size);
2692
png_memcpy(dp, v, 4);
2698
else if (pixel_bytes == 6)
2700
for (i = width; i; i--)
2704
png_memcpy(v, sptr, 6);
2705
for (j = 0; j < png_pass_inc[pass]; j++)
2707
png_memcpy(dp, v, 6);
2713
else if (pixel_bytes == 8)
2715
for (i = width; i; i--)
2719
png_memcpy(v, sptr, 8);
2720
for (j = 0; j < png_pass_inc[pass]; j++)
2722
png_memcpy(dp, v, 8);
2728
else /* GRR: should never be reached */
2730
for (i = width; i; i--)
2734
png_memcpy(v, sptr, pixel_bytes);
2735
for (j = 0; j < png_pass_inc[pass]; j++)
2737
png_memcpy(dp, v, pixel_bytes);
2740
sptr -= pixel_bytes;
2744
} /* end if (MMX not supported) */
2747
} /* end switch (row_info->pixel_depth) */
2749
row_info->width = final_width;
2751
row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2754
} /* end png_do_read_interlace() */
2756
#endif /* PNG_HAVE_MMX_READ_INTERLACE */
2757
#endif /* PNG_READ_INTERLACING_SUPPORTED */
2761
#if defined(PNG_HAVE_MMX_READ_FILTER_ROW)
2762
#if defined(PNG_MMX_CODE_SUPPORTED)
2764
// These variables are utilized in the functions below. They are declared
2765
// globally here to ensure alignment on 8-byte boundaries.
2770
} _LBCarryMask = {0x0101010101010101LL},
2771
_HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2772
_ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2774
#ifdef PNG_THREAD_UNSAFE_OK
2775
//===========================================================================//
2777
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2779
//===========================================================================//
2781
// Optimized code for PNG Average filter decoder
2783
static void /* PRIVATE */
2784
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2788
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2792
bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2793
_FullLength = row_info->rowbytes; // # of bytes to filter
2795
__asm__ __volatile__ (
2796
// initialize address pointers and offset
2798
"pushl %%ebx \n\t" // save index to Global Offset Table
2800
//pre "movl row, %%edi \n\t" // edi: Avg(x)
2801
"xorl %%ebx, %%ebx \n\t" // ebx: x
2802
"movl %%edi, %%edx \n\t"
2803
//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2804
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2805
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2807
"xorl %%eax,%%eax \n\t"
2809
// Compute the Raw value for the first bpp bytes
2810
// Raw(x) = Avg(x) + (Prior(x)/2)
2812
"movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2814
"shrb %%al \n\t" // divide by 2
2815
"addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2816
//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2817
"cmpl %%ecx, %%ebx \n\t"
2818
"movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2819
"jb avg_rlp \n\t" // mov does not affect flags
2821
// get # of bytes to alignment
2822
"movl %%edi, _dif \n\t" // take start of row
2823
"addl %%ebx, _dif \n\t" // add bpp
2824
"addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2825
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2826
"subl %%edi, _dif \n\t" // subtract from start => value ebx at
2827
"jz avg_go \n\t" // alignment
2830
// Compute the Raw value for the bytes up to the alignment boundary
2831
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2832
"xorl %%ecx, %%ecx \n\t"
2835
"xorl %%eax, %%eax \n\t"
2836
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2837
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2838
"addw %%cx, %%ax \n\t"
2840
"shrw %%ax \n\t" // divide by 2
2841
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2842
"cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2843
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2844
"jb avg_lp1 \n\t" // repeat until at alignment boundary
2847
"movl _FullLength, %%eax \n\t"
2848
"movl %%eax, %%ecx \n\t"
2849
"subl %%ebx, %%eax \n\t" // subtract alignment fix
2850
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2851
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
2852
"movl %%ecx, _MMXLength \n\t"
2854
"popl %%ebx \n\t" // restore index to Global Offset Table
2857
: "=c" (dummy_value_c), // output regs (dummy)
2858
"=S" (dummy_value_S),
2859
"=D" (dummy_value_D)
2861
: "0" (bpp), // ecx // input regs
2862
"1" (prev_row), // esi
2865
: "%eax", "%edx" // clobber list
2869
// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2870
// (seems to work fine without...)
2873
// now do the math for the rest of the row
2878
_ActiveMask.use = 0x0000000000ffffffLL;
2879
_ShiftBpp.use = 24; // == 3 * 8
2880
_ShiftRem.use = 40; // == 64 - 24
2882
__asm__ __volatile__ (
2883
// re-init address pointers and offset
2884
"movq _ActiveMask, %%mm7 \n\t"
2885
"movl _dif, %%ecx \n\t" // ecx: x = offset to
2886
"movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2887
// preload "movl row, %%edi \n\t" // edi: Avg(x)
2888
"movq _HBClearMask, %%mm4 \n\t"
2889
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2891
// prime the pump: load the first Raw(x-bpp) data set
2892
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2893
// (correct pos. in loop below)
2895
"movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2896
"movq %%mm5, %%mm3 \n\t"
2897
"psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2899
"movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2900
"movq %%mm7, %%mm6 \n\t"
2901
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2902
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2903
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
2905
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
2907
// add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2908
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2910
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2912
// lsb's were == 1 (only valid for active group)
2913
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2914
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2916
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2918
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2919
// bytes to add to Avg
2920
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2921
// Avg for each Active
2923
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2924
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
2926
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2927
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2928
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2930
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2932
// lsb's were == 1 (only valid for active group)
2933
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2934
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2936
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2938
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2939
// bytes to add to Avg
2940
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2941
// Avg for each Active
2944
// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2945
"psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
2948
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2949
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2950
// Data only needs to be shifted once here to
2951
// get the correct x-bpp offset.
2952
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
2954
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
2956
// lsb's were == 1 (only valid for active group)
2957
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2958
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
2960
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2962
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2963
// bytes to add to Avg
2964
"addl $8, %%ecx \n\t"
2965
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2966
// Avg for each Active
2968
// now ready to write back to memory
2969
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2970
// move updated Raw(x) to use as Raw(x-bpp) for next loop
2971
"cmpl _MMXLength, %%ecx \n\t"
2972
"movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2975
: "=S" (dummy_value_S), // output regs (dummy)
2976
"=D" (dummy_value_D)
2978
: "0" (prev_row), // esi // input regs
2981
: "%ecx" // clobber list
2982
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2983
, "%mm0", "%mm1", "%mm2", "%mm3"
2984
, "%mm4", "%mm5", "%mm6", "%mm7"
2992
//case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2993
//case 5: // GRR BOGUS
2995
_ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2996
// appropriate inactive bytes
2997
_ShiftBpp.use = bpp << 3;
2998
_ShiftRem.use = 64 - _ShiftBpp.use;
3000
__asm__ __volatile__ (
3001
"movq _HBClearMask, %%mm4 \n\t"
3003
// re-init address pointers and offset
3004
"movl _dif, %%ecx \n\t" // ecx: x = offset to
3005
// alignment boundary
3007
// load _ActiveMask and clear all bytes except for 1st active group
3008
"movq _ActiveMask, %%mm7 \n\t"
3009
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3010
"psrlq _ShiftRem, %%mm7 \n\t"
3011
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3012
"movq %%mm7, %%mm6 \n\t"
3013
"movq _LBCarryMask, %%mm5 \n\t"
3014
"psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
3017
// prime the pump: load the first Raw(x-bpp) data set
3018
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3019
// (we correct pos. in loop below)
3021
"movq (%%edi,%%ecx,), %%mm0 \n\t"
3022
"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3023
"movq (%%esi,%%ecx,), %%mm1 \n\t"
3024
// add (Prev_row/2) to average
3025
"movq %%mm5, %%mm3 \n\t"
3026
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3027
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3028
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3030
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3032
// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3033
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3035
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3037
// lsb's were == 1 (only valid for active group)
3038
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3039
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3041
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3043
"pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3044
// bytes to add to Avg
3045
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3048
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3049
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3050
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3051
"addl $8, %%ecx \n\t"
3052
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3054
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3056
// lsb's were == 1 (only valid for active group)
3057
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3058
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3060
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3062
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3063
// bytes to add to Avg
3064
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3065
// Avg for each Active
3067
"cmpl _MMXLength, %%ecx \n\t"
3068
// now ready to write back to memory
3069
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3070
// prep Raw(x-bpp) for next loop
3071
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3074
: "=S" (dummy_value_S), // output regs (dummy)
3075
"=D" (dummy_value_D)
3077
: "0" (prev_row), // esi // input regs
3080
: "%ecx" // clobber list
3081
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3082
, "%mm0", "%mm1", "%mm2", "%mm3"
3083
, "%mm4", "%mm5", "%mm6", "%mm7"
3087
break; // end 4,6 bpp
3091
_ActiveMask.use = 0x000000000000ffffLL;
3092
_ShiftBpp.use = 16; // == 2 * 8
3093
_ShiftRem.use = 48; // == 64 - 16
3095
__asm__ __volatile__ (
3097
"movq _ActiveMask, %%mm7 \n\t"
3098
// re-init address pointers and offset
3099
"movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
3101
"movq _LBCarryMask, %%mm5 \n\t"
3102
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3103
"movq _HBClearMask, %%mm4 \n\t"
3104
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3106
// prime the pump: load the first Raw(x-bpp) data set
3107
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3108
// (we correct pos. in loop below)
3110
"movq (%%edi,%%ecx,), %%mm0 \n\t"
3111
"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3112
"movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3113
// add (Prev_row/2) to average
3114
"movq %%mm5, %%mm3 \n\t"
3115
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3116
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3117
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3119
"movq %%mm7, %%mm6 \n\t"
3120
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3123
// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3124
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3126
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3128
// lsb's were == 1 (only valid
3129
// for active group)
3130
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3131
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3133
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3135
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3136
// bytes to add to Avg
3137
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3138
// for each Active byte
3140
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3141
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3143
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3144
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3145
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3147
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3149
// lsb's were == 1 (only valid
3150
// for active group)
3151
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3152
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3154
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3156
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3157
// bytes to add to Avg
3158
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3159
// Avg for each Active byte
3161
// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3162
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3164
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3165
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3166
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3168
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3169
// where both lsb's were == 1
3170
// (only valid for active group)
3171
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3172
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3174
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3176
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3177
// bytes to add to Avg
3178
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3179
// Avg for each Active byte
3181
// add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3182
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
3184
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3185
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3186
"addl $8, %%ecx \n\t"
3187
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
3189
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3191
// lsb's were == 1 (only valid
3192
// for active group)
3193
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3194
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3196
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3198
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3199
// bytes to add to Avg
3200
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3201
// Avg for each Active byte
3203
"cmpl _MMXLength, %%ecx \n\t"
3204
// now ready to write back to memory
3205
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3206
// prep Raw(x-bpp) for next loop
3207
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3210
: "=S" (dummy_value_S), // output regs (dummy)
3211
"=D" (dummy_value_D)
3213
: "0" (prev_row), // esi // input regs
3216
: "%ecx" // clobber list
3217
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3218
, "%mm0", "%mm1", "%mm2", "%mm3"
3219
, "%mm4", "%mm5", "%mm6", "%mm7"
3227
__asm__ __volatile__ (
3228
// re-init address pointers and offset
3230
"pushl %%ebx \n\t" // save Global Offset Table index
3232
"movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
3234
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3235
"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3237
// do Paeth decode for remaining bytes
3238
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3239
"movl %%edi, %%edx \n\t"
3240
// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3241
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3242
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3245
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3246
"xorl %%eax, %%eax \n\t"
3247
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3248
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3249
"addw %%cx, %%ax \n\t"
3251
"shrw %%ax \n\t" // divide by 2
3252
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3254
"cmpl _FullLength, %%ebx \n\t" // check if at end of array
3255
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3256
// mov does not affect flags; -1 to offset inc ebx
3261
"popl %%ebx \n\t" // Global Offset Table index
3264
: "=c" (dummy_value_c), // output regs (dummy)
3265
"=S" (dummy_value_S),
3266
"=D" (dummy_value_D)
3268
: "0" (bpp), // ecx // input regs
3269
"1" (prev_row), // esi
3272
: "%eax", "%edx" // clobber list
3278
return; // end 1 bpp
3282
__asm__ __volatile__ (
3283
// re-init address pointers and offset
3284
"movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
3285
"movq _LBCarryMask, %%mm5 \n\t" // boundary
3286
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3287
"movq _HBClearMask, %%mm4 \n\t"
3288
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3290
// prime the pump: load the first Raw(x-bpp) data set
3291
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3292
// (NO NEED to correct pos. in loop below)
3295
"movq (%%edi,%%ecx,), %%mm0 \n\t"
3296
"movq %%mm5, %%mm3 \n\t"
3297
"movq (%%esi,%%ecx,), %%mm1 \n\t"
3298
"addl $8, %%ecx \n\t"
3299
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3300
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3301
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3302
// where both lsb's were == 1
3303
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3304
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3305
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3306
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3307
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3308
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3309
"cmpl _MMXLength, %%ecx \n\t"
3310
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3311
"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3314
: "=S" (dummy_value_S), // output regs (dummy)
3315
"=D" (dummy_value_D)
3317
: "0" (prev_row), // esi // input regs
3320
: "%ecx" // clobber list
3321
#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3322
, "%mm0", "%mm1", "%mm2"
3323
, "%mm3", "%mm4", "%mm5"
3329
default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3333
// GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3335
"Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3339
__asm__ __volatile__ (
3340
"movq _LBCarryMask, %%mm5 \n\t"
3341
// re-init address pointers and offset
3342
"movl _dif, %%ebx \n\t" // ebx: x = offset to
3343
// alignment boundary
3344
"movl row, %%edi \n\t" // edi: Avg(x)
3345
"movq _HBClearMask, %%mm4 \n\t"
3346
"movl %%edi, %%edx \n\t"
3347
"movl prev_row, %%esi \n\t" // esi: Prior(x)
3348
"subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3350
"movq (%%edi,%%ebx,), %%mm0 \n\t"
3351
"movq %%mm5, %%mm3 \n\t"
3352
"movq (%%esi,%%ebx,), %%mm1 \n\t"
3353
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3354
"movq (%%edx,%%ebx,), %%mm2 \n\t"
3355
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3356
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3357
// where both lsb's were == 1
3358
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3359
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
3361
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
3363
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
3365
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
3367
"addl $8, %%ebx \n\t"
3368
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3370
"cmpl _MMXLength, %%ebx \n\t"
3371
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3374
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3376
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3378
: "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3380
#endif /* 0 - NEVER REACHED */
3384
} // end switch (bpp)
3386
__asm__ __volatile__ (
3387
// MMX acceleration complete; now do clean-up
3388
// check if any remaining bytes left to decode
3390
"pushl %%ebx \n\t" // save index to Global Offset Table
3392
"movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3393
//pre "movl row, %%edi \n\t" // edi: Avg(x)
3394
"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3397
// do Avg decode for remaining bytes
3398
//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3399
"movl %%edi, %%edx \n\t"
3400
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3401
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3402
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3405
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3406
"xorl %%eax, %%eax \n\t"
3407
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3408
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3409
"addw %%cx, %%ax \n\t"
3411
"shrw %%ax \n\t" // divide by 2
3412
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3413
"cmpl _FullLength, %%ebx \n\t" // check if at end of array
3414
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3415
"jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3418
"EMMS \n\t" // end MMX; prep for poss. FP instrs.
3420
"popl %%ebx \n\t" // restore index to Global Offset Table
3423
: "=c" (dummy_value_c), // output regs (dummy)
3424
"=S" (dummy_value_S),
3425
"=D" (dummy_value_D)
3427
: "0" (bpp), // ecx // input regs
3428
"1" (prev_row), // esi
3431
: "%eax", "%edx" // clobber list
3437
} /* end png_read_filter_row_mmx_avg() */
3442
#ifdef PNG_THREAD_UNSAFE_OK
3443
//===========================================================================//
3445
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3447
//===========================================================================//
3449
// Optimized code for PNG Paeth filter decoder
3451
static void /* PRIVATE */
3452
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3456
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3460
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3461
_FullLength = row_info->rowbytes; // # of bytes to filter
3463
__asm__ __volatile__ (
3465
"pushl %%ebx \n\t" // save index to Global Offset Table
3467
"xorl %%ebx, %%ebx \n\t" // ebx: x offset
3468
//pre "movl row, %%edi \n\t"
3469
"xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3470
//pre "movl prev_row, %%esi \n\t"
3471
"xorl %%eax, %%eax \n\t"
3473
// Compute the Raw value for the first bpp bytes
3474
// Note: the formula works out to be always
3475
// Paeth(x) = Raw(x) + Prior(x) where x < bpp
3477
"movb (%%edi,%%ebx,), %%al \n\t"
3478
"addb (%%esi,%%ebx,), %%al \n\t"
3480
//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3481
"cmpl %%ecx, %%ebx \n\t"
3482
"movb %%al, -1(%%edi,%%ebx,) \n\t"
3484
// get # of bytes to alignment
3485
"movl %%edi, _dif \n\t" // take start of row
3486
"addl %%ebx, _dif \n\t" // add bpp
3487
"xorl %%ecx, %%ecx \n\t"
3488
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3490
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3491
"subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3497
"xorl %%eax, %%eax \n\t"
3498
// pav = p - a = (a + b - c) - a = b - c
3499
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3500
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3501
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3502
"movl %%eax, _patemp \n\t" // Save pav for later use
3503
"xorl %%eax, %%eax \n\t"
3504
// pbv = p - b = (a + b - c) - b = a - c
3505
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3506
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3507
"movl %%eax, %%ecx \n\t"
3508
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3509
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
3511
"testl $0x80000000, %%eax \n\t"
3513
"negl %%eax \n\t" // reverse sign of neg values
3516
"movl %%eax, _pctemp \n\t" // save pc for later use
3518
"testl $0x80000000, %%ecx \n\t"
3520
"negl %%ecx \n\t" // reverse sign of neg values
3523
"movl %%ecx, _pbtemp \n\t" // save pb for later use
3525
"movl _patemp, %%eax \n\t"
3526
"testl $0x80000000, %%eax \n\t"
3528
"negl %%eax \n\t" // reverse sign of neg values
3531
"movl %%eax, _patemp \n\t" // save pa for later use
3533
"cmpl %%ecx, %%eax \n\t"
3534
"jna paeth_abb \n\t"
3535
// pa > pb; now test if pb <= pc
3536
"cmpl _pctemp, %%ecx \n\t"
3537
"jna paeth_bbc \n\t"
3538
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3539
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3540
"jmp paeth_paeth \n\t"
3543
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3544
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3545
"jmp paeth_paeth \n\t"
3548
// pa <= pb; now test if pa <= pc
3549
"cmpl _pctemp, %%eax \n\t"
3550
"jna paeth_abc \n\t"
3551
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3552
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3553
"jmp paeth_paeth \n\t"
3556
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3557
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3562
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3563
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
3564
"cmpl _dif, %%ebx \n\t"
3568
"movl _FullLength, %%ecx \n\t"
3569
"movl %%ecx, %%eax \n\t"
3570
"subl %%ebx, %%eax \n\t" // subtract alignment fix
3571
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3572
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
3573
"movl %%ecx, _MMXLength \n\t"
3575
"popl %%ebx \n\t" // restore index to Global Offset Table
3578
: "=c" (dummy_value_c), // output regs (dummy)
3579
"=S" (dummy_value_S),
3580
"=D" (dummy_value_D)
3582
: "0" (bpp), // ecx // input regs
3583
"1" (prev_row), // esi
3586
: "%eax", "%edx" // clobber list
3592
// now do the math for the rest of the row
3597
_ActiveMask.use = 0x0000000000ffffffLL;
3598
_ActiveMaskEnd.use = 0xffff000000000000LL;
3599
_ShiftBpp.use = 24; // == bpp(3) * 8
3600
_ShiftRem.use = 40; // == 64 - 24
3602
__asm__ __volatile__ (
3603
"movl _dif, %%ecx \n\t"
3604
// preload "movl row, %%edi \n\t"
3605
// preload "movl prev_row, %%esi \n\t"
3606
"pxor %%mm0, %%mm0 \n\t"
3607
// prime the pump: load the first Raw(x-bpp) data set
3608
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3610
"psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3612
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3613
"punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3614
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3615
"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3616
"psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3618
// pav = p - a = (a + b - c) - a = b - c
3619
"movq %%mm2, %%mm4 \n\t"
3620
"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3621
// pbv = p - b = (a + b - c) - b = a - c
3622
"movq %%mm1, %%mm5 \n\t"
3623
"psubw %%mm3, %%mm4 \n\t"
3624
"pxor %%mm7, %%mm7 \n\t"
3625
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3626
"movq %%mm4, %%mm6 \n\t"
3627
"psubw %%mm3, %%mm5 \n\t"
3629
// pa = abs(p-a) = abs(pav)
3630
// pb = abs(p-b) = abs(pbv)
3631
// pc = abs(p-c) = abs(pcv)
3632
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3633
"paddw %%mm5, %%mm6 \n\t"
3634
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3635
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3636
"psubw %%mm0, %%mm4 \n\t"
3637
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3638
"psubw %%mm0, %%mm4 \n\t"
3639
"psubw %%mm7, %%mm5 \n\t"
3640
"pxor %%mm0, %%mm0 \n\t"
3641
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3642
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3643
"psubw %%mm7, %%mm5 \n\t"
3644
"psubw %%mm0, %%mm6 \n\t"
3646
"movq %%mm4, %%mm7 \n\t"
3647
"psubw %%mm0, %%mm6 \n\t"
3648
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3649
"movq %%mm7, %%mm0 \n\t"
3650
// use mm7 mask to merge pa & pb
3651
"pand %%mm7, %%mm5 \n\t"
3652
// use mm0 mask copy to merge a & b
3653
"pand %%mm0, %%mm2 \n\t"
3654
"pandn %%mm4, %%mm7 \n\t"
3655
"pandn %%mm1, %%mm0 \n\t"
3656
"paddw %%mm5, %%mm7 \n\t"
3657
"paddw %%mm2, %%mm0 \n\t"
3658
// test ((pa <= pb)? pa:pb) <= pc
3659
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3660
"pxor %%mm1, %%mm1 \n\t"
3661
"pand %%mm7, %%mm3 \n\t"
3662
"pandn %%mm0, %%mm7 \n\t"
3663
"paddw %%mm3, %%mm7 \n\t"
3664
"pxor %%mm0, %%mm0 \n\t"
3665
"packuswb %%mm1, %%mm7 \n\t"
3666
"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3667
"pand _ActiveMask, %%mm7 \n\t"
3668
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3669
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3670
"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3671
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3672
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
3674
// now do Paeth for 2nd set of bytes (3-5)
3675
"psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3676
"punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3677
"pxor %%mm7, %%mm7 \n\t"
3678
"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3679
// pbv = p - b = (a + b - c) - b = a - c
3680
"movq %%mm1, %%mm5 \n\t"
3681
// pav = p - a = (a + b - c) - a = b - c
3682
"movq %%mm2, %%mm4 \n\t"
3683
"psubw %%mm3, %%mm5 \n\t"
3684
"psubw %%mm3, %%mm4 \n\t"
3685
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3686
// pav + pbv = pbv + pav
3687
"movq %%mm5, %%mm6 \n\t"
3688
"paddw %%mm4, %%mm6 \n\t"
3690
// pa = abs(p-a) = abs(pav)
3691
// pb = abs(p-b) = abs(pbv)
3692
// pc = abs(p-c) = abs(pcv)
3693
"pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3694
"pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3695
"pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3696
"pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3697
"psubw %%mm0, %%mm5 \n\t"
3698
"psubw %%mm7, %%mm4 \n\t"
3699
"psubw %%mm0, %%mm5 \n\t"
3700
"psubw %%mm7, %%mm4 \n\t"
3701
"pxor %%mm0, %%mm0 \n\t"
3702
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3703
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3704
"psubw %%mm0, %%mm6 \n\t"
3706
"movq %%mm4, %%mm7 \n\t"
3707
"psubw %%mm0, %%mm6 \n\t"
3708
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3709
"movq %%mm7, %%mm0 \n\t"
3710
// use mm7 mask to merge pa & pb
3711
"pand %%mm7, %%mm5 \n\t"
3712
// use mm0 mask copy to merge a & b
3713
"pand %%mm0, %%mm2 \n\t"
3714
"pandn %%mm4, %%mm7 \n\t"
3715
"pandn %%mm1, %%mm0 \n\t"
3716
"paddw %%mm5, %%mm7 \n\t"
3717
"paddw %%mm2, %%mm0 \n\t"
3718
// test ((pa <= pb)? pa:pb) <= pc
3719
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3720
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3721
"pand %%mm7, %%mm3 \n\t"
3722
"pandn %%mm0, %%mm7 \n\t"
3723
"pxor %%mm1, %%mm1 \n\t"
3724
"paddw %%mm3, %%mm7 \n\t"
3725
"pxor %%mm0, %%mm0 \n\t"
3726
"packuswb %%mm1, %%mm7 \n\t"
3727
"movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3728
"pand _ActiveMask, %%mm7 \n\t"
3729
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3730
"psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
3732
// pav = p - a = (a + b - c) - a = b - c
3733
"movq %%mm2, %%mm4 \n\t"
3734
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3735
"psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3736
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3737
"movq %%mm7, %%mm1 \n\t"
3738
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3739
"psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3740
// now mm1 will be used as Raw(x-bpp)
3741
// now do Paeth for 3rd, and final, set of bytes (6-7)
3742
"pxor %%mm7, %%mm7 \n\t"
3743
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3744
"psubw %%mm3, %%mm4 \n\t"
3745
// pbv = p - b = (a + b - c) - b = a - c
3746
"movq %%mm1, %%mm5 \n\t"
3747
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3748
"movq %%mm4, %%mm6 \n\t"
3749
"psubw %%mm3, %%mm5 \n\t"
3750
"pxor %%mm0, %%mm0 \n\t"
3751
"paddw %%mm5, %%mm6 \n\t"
3753
// pa = abs(p-a) = abs(pav)
3754
// pb = abs(p-b) = abs(pbv)
3755
// pc = abs(p-c) = abs(pcv)
3756
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3757
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3758
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3759
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3760
"psubw %%mm0, %%mm4 \n\t"
3761
"psubw %%mm7, %%mm5 \n\t"
3762
"psubw %%mm0, %%mm4 \n\t"
3763
"psubw %%mm7, %%mm5 \n\t"
3764
"pxor %%mm0, %%mm0 \n\t"
3765
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3766
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3767
"psubw %%mm0, %%mm6 \n\t"
3769
"movq %%mm4, %%mm7 \n\t"
3770
"psubw %%mm0, %%mm6 \n\t"
3771
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3772
"movq %%mm7, %%mm0 \n\t"
3773
// use mm0 mask copy to merge a & b
3774
"pand %%mm0, %%mm2 \n\t"
3775
// use mm7 mask to merge pa & pb
3776
"pand %%mm7, %%mm5 \n\t"
3777
"pandn %%mm1, %%mm0 \n\t"
3778
"pandn %%mm4, %%mm7 \n\t"
3779
"paddw %%mm2, %%mm0 \n\t"
3780
"paddw %%mm5, %%mm7 \n\t"
3781
// test ((pa <= pb)? pa:pb) <= pc
3782
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3783
"pand %%mm7, %%mm3 \n\t"
3784
"pandn %%mm0, %%mm7 \n\t"
3785
"paddw %%mm3, %%mm7 \n\t"
3786
"pxor %%mm1, %%mm1 \n\t"
3787
"packuswb %%mm7, %%mm1 \n\t"
3788
// step ecx to next set of 8 bytes and repeat loop til done
3789
"addl $8, %%ecx \n\t"
3790
"pand _ActiveMaskEnd, %%mm1 \n\t"
3791
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3794
"cmpl _MMXLength, %%ecx \n\t"
3795
"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3796
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3797
// mm1 will be used as Raw(x-bpp) next loop
3798
// mm3 ready to be used as Prior(x-bpp) next loop
3801
: "=S" (dummy_value_S), // output regs (dummy)
3802
"=D" (dummy_value_D)
3804
: "0" (prev_row), // esi // input regs
3807
: "%ecx" // clobber list
3808
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3809
, "%mm0", "%mm1", "%mm2", "%mm3"
3810
, "%mm4", "%mm5", "%mm6", "%mm7"
3817
//case 7: // GRR BOGUS
3818
//case 5: // GRR BOGUS
3820
_ActiveMask.use = 0x00000000ffffffffLL;
3821
_ActiveMask2.use = 0xffffffff00000000LL;
3822
_ShiftBpp.use = bpp << 3; // == bpp * 8
3823
_ShiftRem.use = 64 - _ShiftBpp.use;
3825
__asm__ __volatile__ (
3826
"movl _dif, %%ecx \n\t"
3827
// preload "movl row, %%edi \n\t"
3828
// preload "movl prev_row, %%esi \n\t"
3829
// prime the pump: load the first Raw(x-bpp) data set
3830
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3831
"pxor %%mm0, %%mm0 \n\t"
3834
// must shift to position Raw(x-bpp) data
3835
"psrlq _ShiftRem, %%mm1 \n\t"
3836
// do first set of 4 bytes
3837
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3838
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3839
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3840
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3841
// must shift to position Prior(x-bpp) data
3842
"psrlq _ShiftRem, %%mm3 \n\t"
3843
// pav = p - a = (a + b - c) - a = b - c
3844
"movq %%mm2, %%mm4 \n\t"
3845
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3846
// pbv = p - b = (a + b - c) - b = a - c
3847
"movq %%mm1, %%mm5 \n\t"
3848
"psubw %%mm3, %%mm4 \n\t"
3849
"pxor %%mm7, %%mm7 \n\t"
3850
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3851
"movq %%mm4, %%mm6 \n\t"
3852
"psubw %%mm3, %%mm5 \n\t"
3853
// pa = abs(p-a) = abs(pav)
3854
// pb = abs(p-b) = abs(pbv)
3855
// pc = abs(p-c) = abs(pcv)
3856
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3857
"paddw %%mm5, %%mm6 \n\t"
3858
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3859
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3860
"psubw %%mm0, %%mm4 \n\t"
3861
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3862
"psubw %%mm0, %%mm4 \n\t"
3863
"psubw %%mm7, %%mm5 \n\t"
3864
"pxor %%mm0, %%mm0 \n\t"
3865
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3866
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3867
"psubw %%mm7, %%mm5 \n\t"
3868
"psubw %%mm0, %%mm6 \n\t"
3870
"movq %%mm4, %%mm7 \n\t"
3871
"psubw %%mm0, %%mm6 \n\t"
3872
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3873
"movq %%mm7, %%mm0 \n\t"
3874
// use mm7 mask to merge pa & pb
3875
"pand %%mm7, %%mm5 \n\t"
3876
// use mm0 mask copy to merge a & b
3877
"pand %%mm0, %%mm2 \n\t"
3878
"pandn %%mm4, %%mm7 \n\t"
3879
"pandn %%mm1, %%mm0 \n\t"
3880
"paddw %%mm5, %%mm7 \n\t"
3881
"paddw %%mm2, %%mm0 \n\t"
3882
// test ((pa <= pb)? pa:pb) <= pc
3883
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3884
"pxor %%mm1, %%mm1 \n\t"
3885
"pand %%mm7, %%mm3 \n\t"
3886
"pandn %%mm0, %%mm7 \n\t"
3887
"paddw %%mm3, %%mm7 \n\t"
3888
"pxor %%mm0, %%mm0 \n\t"
3889
"packuswb %%mm1, %%mm7 \n\t"
3890
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3891
"pand _ActiveMask, %%mm7 \n\t"
3892
"psrlq _ShiftRem, %%mm3 \n\t"
3893
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3894
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3895
"movq %%mm2, %%mm6 \n\t"
3896
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3897
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3898
"psllq _ShiftBpp, %%mm6 \n\t"
3899
"movq %%mm7, %%mm5 \n\t"
3900
"psrlq _ShiftRem, %%mm1 \n\t"
3901
"por %%mm6, %%mm3 \n\t"
3902
"psllq _ShiftBpp, %%mm5 \n\t"
3903
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3904
"por %%mm5, %%mm1 \n\t"
3905
// do second set of 4 bytes
3906
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3907
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3908
// pav = p - a = (a + b - c) - a = b - c
3909
"movq %%mm2, %%mm4 \n\t"
3910
// pbv = p - b = (a + b - c) - b = a - c
3911
"movq %%mm1, %%mm5 \n\t"
3912
"psubw %%mm3, %%mm4 \n\t"
3913
"pxor %%mm7, %%mm7 \n\t"
3914
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3915
"movq %%mm4, %%mm6 \n\t"
3916
"psubw %%mm3, %%mm5 \n\t"
3917
// pa = abs(p-a) = abs(pav)
3918
// pb = abs(p-b) = abs(pbv)
3919
// pc = abs(p-c) = abs(pcv)
3920
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3921
"paddw %%mm5, %%mm6 \n\t"
3922
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3923
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3924
"psubw %%mm0, %%mm4 \n\t"
3925
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3926
"psubw %%mm0, %%mm4 \n\t"
3927
"psubw %%mm7, %%mm5 \n\t"
3928
"pxor %%mm0, %%mm0 \n\t"
3929
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3930
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3931
"psubw %%mm7, %%mm5 \n\t"
3932
"psubw %%mm0, %%mm6 \n\t"
3934
"movq %%mm4, %%mm7 \n\t"
3935
"psubw %%mm0, %%mm6 \n\t"
3936
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3937
"movq %%mm7, %%mm0 \n\t"
3938
// use mm7 mask to merge pa & pb
3939
"pand %%mm7, %%mm5 \n\t"
3940
// use mm0 mask copy to merge a & b
3941
"pand %%mm0, %%mm2 \n\t"
3942
"pandn %%mm4, %%mm7 \n\t"
3943
"pandn %%mm1, %%mm0 \n\t"
3944
"paddw %%mm5, %%mm7 \n\t"
3945
"paddw %%mm2, %%mm0 \n\t"
3946
// test ((pa <= pb)? pa:pb) <= pc
3947
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3948
"pxor %%mm1, %%mm1 \n\t"
3949
"pand %%mm7, %%mm3 \n\t"
3950
"pandn %%mm0, %%mm7 \n\t"
3951
"pxor %%mm1, %%mm1 \n\t"
3952
"paddw %%mm3, %%mm7 \n\t"
3953
"pxor %%mm0, %%mm0 \n\t"
3954
// step ecx to next set of 8 bytes and repeat loop til done
3955
"addl $8, %%ecx \n\t"
3956
"packuswb %%mm7, %%mm1 \n\t"
3957
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3958
"cmpl _MMXLength, %%ecx \n\t"
3959
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3960
// mm1 will be used as Raw(x-bpp) next loop
3963
: "=S" (dummy_value_S), // output regs (dummy)
3964
"=D" (dummy_value_D)
3966
: "0" (prev_row), // esi // input regs
3969
: "%ecx" // clobber list
3970
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3971
, "%mm0", "%mm1", "%mm2", "%mm3"
3972
, "%mm4", "%mm5", "%mm6", "%mm7"
3980
_ActiveMask.use = 0x00000000ffffffffLL;
3982
__asm__ __volatile__ (
3983
"movl _dif, %%ecx \n\t"
3984
// preload "movl row, %%edi \n\t"
3985
// preload "movl prev_row, %%esi \n\t"
3986
"pxor %%mm0, %%mm0 \n\t"
3987
// prime the pump: load the first Raw(x-bpp) data set
3988
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3989
// a=Raw(x-bpp) bytes
3991
// do first set of 4 bytes
3992
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3993
"punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3994
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3995
"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3996
// pav = p - a = (a + b - c) - a = b - c
3997
"movq %%mm2, %%mm4 \n\t"
3998
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3999
// pbv = p - b = (a + b - c) - b = a - c
4000
"movq %%mm1, %%mm5 \n\t"
4001
"psubw %%mm3, %%mm4 \n\t"
4002
"pxor %%mm7, %%mm7 \n\t"
4003
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4004
"movq %%mm4, %%mm6 \n\t"
4005
"psubw %%mm3, %%mm5 \n\t"
4006
// pa = abs(p-a) = abs(pav)
4007
// pb = abs(p-b) = abs(pbv)
4008
// pc = abs(p-c) = abs(pcv)
4009
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4010
"paddw %%mm5, %%mm6 \n\t"
4011
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4012
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4013
"psubw %%mm0, %%mm4 \n\t"
4014
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4015
"psubw %%mm0, %%mm4 \n\t"
4016
"psubw %%mm7, %%mm5 \n\t"
4017
"pxor %%mm0, %%mm0 \n\t"
4018
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4019
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4020
"psubw %%mm7, %%mm5 \n\t"
4021
"psubw %%mm0, %%mm6 \n\t"
4023
"movq %%mm4, %%mm7 \n\t"
4024
"psubw %%mm0, %%mm6 \n\t"
4025
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4026
"movq %%mm7, %%mm0 \n\t"
4027
// use mm7 mask to merge pa & pb
4028
"pand %%mm7, %%mm5 \n\t"
4029
// use mm0 mask copy to merge a & b
4030
"pand %%mm0, %%mm2 \n\t"
4031
"pandn %%mm4, %%mm7 \n\t"
4032
"pandn %%mm1, %%mm0 \n\t"
4033
"paddw %%mm5, %%mm7 \n\t"
4034
"paddw %%mm2, %%mm0 \n\t"
4035
// test ((pa <= pb)? pa:pb) <= pc
4036
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4037
"pxor %%mm1, %%mm1 \n\t"
4038
"pand %%mm7, %%mm3 \n\t"
4039
"pandn %%mm0, %%mm7 \n\t"
4040
"paddw %%mm3, %%mm7 \n\t"
4041
"pxor %%mm0, %%mm0 \n\t"
4042
"packuswb %%mm1, %%mm7 \n\t"
4043
"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4044
"pand _ActiveMask, %%mm7 \n\t"
4045
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4046
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4047
"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4048
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4049
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4050
// do second set of 4 bytes
4051
"punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4052
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4053
// pav = p - a = (a + b - c) - a = b - c
4054
"movq %%mm2, %%mm4 \n\t"
4055
// pbv = p - b = (a + b - c) - b = a - c
4056
"movq %%mm1, %%mm5 \n\t"
4057
"psubw %%mm3, %%mm4 \n\t"
4058
"pxor %%mm7, %%mm7 \n\t"
4059
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4060
"movq %%mm4, %%mm6 \n\t"
4061
"psubw %%mm3, %%mm5 \n\t"
4062
// pa = abs(p-a) = abs(pav)
4063
// pb = abs(p-b) = abs(pbv)
4064
// pc = abs(p-c) = abs(pcv)
4065
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4066
"paddw %%mm5, %%mm6 \n\t"
4067
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4068
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4069
"psubw %%mm0, %%mm4 \n\t"
4070
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4071
"psubw %%mm0, %%mm4 \n\t"
4072
"psubw %%mm7, %%mm5 \n\t"
4073
"pxor %%mm0, %%mm0 \n\t"
4074
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4075
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4076
"psubw %%mm7, %%mm5 \n\t"
4077
"psubw %%mm0, %%mm6 \n\t"
4079
"movq %%mm4, %%mm7 \n\t"
4080
"psubw %%mm0, %%mm6 \n\t"
4081
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4082
"movq %%mm7, %%mm0 \n\t"
4083
// use mm7 mask to merge pa & pb
4084
"pand %%mm7, %%mm5 \n\t"
4085
// use mm0 mask copy to merge a & b
4086
"pand %%mm0, %%mm2 \n\t"
4087
"pandn %%mm4, %%mm7 \n\t"
4088
"pandn %%mm1, %%mm0 \n\t"
4089
"paddw %%mm5, %%mm7 \n\t"
4090
"paddw %%mm2, %%mm0 \n\t"
4091
// test ((pa <= pb)? pa:pb) <= pc
4092
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4093
"pxor %%mm1, %%mm1 \n\t"
4094
"pand %%mm7, %%mm3 \n\t"
4095
"pandn %%mm0, %%mm7 \n\t"
4096
"pxor %%mm1, %%mm1 \n\t"
4097
"paddw %%mm3, %%mm7 \n\t"
4098
"pxor %%mm0, %%mm0 \n\t"
4099
// step ecx to next set of 8 bytes and repeat loop til done
4100
"addl $8, %%ecx \n\t"
4101
"packuswb %%mm7, %%mm1 \n\t"
4102
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4103
"cmpl _MMXLength, %%ecx \n\t"
4104
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4105
// mm1 will be used as Raw(x-bpp) next loop
4108
: "=S" (dummy_value_S), // output regs (dummy)
4109
"=D" (dummy_value_D)
4111
: "0" (prev_row), // esi // input regs
4114
: "%ecx" // clobber list
4115
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4116
, "%mm0", "%mm1", "%mm2", "%mm3"
4117
, "%mm4", "%mm5", "%mm6", "%mm7"
4125
_ActiveMask.use = 0x00000000ffffffffLL;
4127
__asm__ __volatile__ (
4128
"movl _dif, %%ecx \n\t"
4129
// preload "movl row, %%edi \n\t"
4130
// preload "movl prev_row, %%esi \n\t"
4131
"pxor %%mm0, %%mm0 \n\t"
4132
// prime the pump: load the first Raw(x-bpp) data set
4133
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4134
// a=Raw(x-bpp) bytes
4136
// do first set of 4 bytes
4137
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4138
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4139
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4140
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4141
// pav = p - a = (a + b - c) - a = b - c
4142
"movq %%mm2, %%mm4 \n\t"
4143
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4144
// pbv = p - b = (a + b - c) - b = a - c
4145
"movq %%mm1, %%mm5 \n\t"
4146
"psubw %%mm3, %%mm4 \n\t"
4147
"pxor %%mm7, %%mm7 \n\t"
4148
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4149
"movq %%mm4, %%mm6 \n\t"
4150
"psubw %%mm3, %%mm5 \n\t"
4151
// pa = abs(p-a) = abs(pav)
4152
// pb = abs(p-b) = abs(pbv)
4153
// pc = abs(p-c) = abs(pcv)
4154
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4155
"paddw %%mm5, %%mm6 \n\t"
4156
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4157
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4158
"psubw %%mm0, %%mm4 \n\t"
4159
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4160
"psubw %%mm0, %%mm4 \n\t"
4161
"psubw %%mm7, %%mm5 \n\t"
4162
"pxor %%mm0, %%mm0 \n\t"
4163
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4164
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4165
"psubw %%mm7, %%mm5 \n\t"
4166
"psubw %%mm0, %%mm6 \n\t"
4168
"movq %%mm4, %%mm7 \n\t"
4169
"psubw %%mm0, %%mm6 \n\t"
4170
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4171
"movq %%mm7, %%mm0 \n\t"
4172
// use mm7 mask to merge pa & pb
4173
"pand %%mm7, %%mm5 \n\t"
4174
// use mm0 mask copy to merge a & b
4175
"pand %%mm0, %%mm2 \n\t"
4176
"pandn %%mm4, %%mm7 \n\t"
4177
"pandn %%mm1, %%mm0 \n\t"
4178
"paddw %%mm5, %%mm7 \n\t"
4179
"paddw %%mm2, %%mm0 \n\t"
4180
// test ((pa <= pb)? pa:pb) <= pc
4181
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4182
"pxor %%mm1, %%mm1 \n\t"
4183
"pand %%mm7, %%mm3 \n\t"
4184
"pandn %%mm0, %%mm7 \n\t"
4185
"paddw %%mm3, %%mm7 \n\t"
4186
"pxor %%mm0, %%mm0 \n\t"
4187
"packuswb %%mm1, %%mm7 \n\t"
4188
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4189
"pand _ActiveMask, %%mm7 \n\t"
4190
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4191
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4192
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4193
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4194
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4196
// do second set of 4 bytes
4197
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4198
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4199
// pav = p - a = (a + b - c) - a = b - c
4200
"movq %%mm2, %%mm4 \n\t"
4201
// pbv = p - b = (a + b - c) - b = a - c
4202
"movq %%mm1, %%mm5 \n\t"
4203
"psubw %%mm3, %%mm4 \n\t"
4204
"pxor %%mm7, %%mm7 \n\t"
4205
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4206
"movq %%mm4, %%mm6 \n\t"
4207
"psubw %%mm3, %%mm5 \n\t"
4208
// pa = abs(p-a) = abs(pav)
4209
// pb = abs(p-b) = abs(pbv)
4210
// pc = abs(p-c) = abs(pcv)
4211
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4212
"paddw %%mm5, %%mm6 \n\t"
4213
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
4214
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4215
"psubw %%mm0, %%mm4 \n\t"
4216
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
4217
"psubw %%mm0, %%mm4 \n\t"
4218
"psubw %%mm7, %%mm5 \n\t"
4219
"pxor %%mm0, %%mm0 \n\t"
4220
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4221
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
4222
"psubw %%mm7, %%mm5 \n\t"
4223
"psubw %%mm0, %%mm6 \n\t"
4225
"movq %%mm4, %%mm7 \n\t"
4226
"psubw %%mm0, %%mm6 \n\t"
4227
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4228
"movq %%mm7, %%mm0 \n\t"
4229
// use mm7 mask to merge pa & pb
4230
"pand %%mm7, %%mm5 \n\t"
4231
// use mm0 mask copy to merge a & b
4232
"pand %%mm0, %%mm2 \n\t"
4233
"pandn %%mm4, %%mm7 \n\t"
4234
"pandn %%mm1, %%mm0 \n\t"
4235
"paddw %%mm5, %%mm7 \n\t"
4236
"paddw %%mm2, %%mm0 \n\t"
4237
// test ((pa <= pb)? pa:pb) <= pc
4238
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4239
"pxor %%mm1, %%mm1 \n\t"
4240
"pand %%mm7, %%mm3 \n\t"
4241
"pandn %%mm0, %%mm7 \n\t"
4242
"pxor %%mm1, %%mm1 \n\t"
4243
"paddw %%mm3, %%mm7 \n\t"
4244
"pxor %%mm0, %%mm0 \n\t"
4245
// step ecx to next set of 8 bytes and repeat loop til done
4246
"addl $8, %%ecx \n\t"
4247
"packuswb %%mm7, %%mm1 \n\t"
4248
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4249
"cmpl _MMXLength, %%ecx \n\t"
4250
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4251
// mm1 will be used as Raw(x-bpp) next loop
4254
: "=S" (dummy_value_S), // output regs (dummy)
4255
"=D" (dummy_value_D)
4257
: "0" (prev_row), // esi // input regs
4260
: "%ecx" // clobber list
4261
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4262
, "%mm0", "%mm1", "%mm2", "%mm3"
4263
, "%mm4", "%mm5", "%mm6", "%mm7"
4273
__asm__ __volatile__ (
4275
"pushl %%ebx \n\t" // save Global Offset Table index
4277
"movl _dif, %%ebx \n\t"
4278
"cmpl _FullLength, %%ebx \n\t"
4279
"jnb paeth_dend \n\t"
4281
// preload "movl row, %%edi \n\t"
4282
// preload "movl prev_row, %%esi \n\t"
4283
// do Paeth decode for remaining bytes
4284
"movl %%ebx, %%edx \n\t"
4285
// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4286
"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4287
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4290
"xorl %%eax, %%eax \n\t"
4291
// pav = p - a = (a + b - c) - a = b - c
4292
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4293
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4294
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4295
"movl %%eax, _patemp \n\t" // Save pav for later use
4296
"xorl %%eax, %%eax \n\t"
4297
// pbv = p - b = (a + b - c) - b = a - c
4298
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4299
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4300
"movl %%eax, %%ecx \n\t"
4301
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4302
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
4304
"testl $0x80000000, %%eax \n\t"
4305
"jz paeth_dpca \n\t"
4306
"negl %%eax \n\t" // reverse sign of neg values
4309
"movl %%eax, _pctemp \n\t" // save pc for later use
4311
"testl $0x80000000, %%ecx \n\t"
4312
"jz paeth_dpba \n\t"
4313
"negl %%ecx \n\t" // reverse sign of neg values
4316
"movl %%ecx, _pbtemp \n\t" // save pb for later use
4318
"movl _patemp, %%eax \n\t"
4319
"testl $0x80000000, %%eax \n\t"
4320
"jz paeth_dpaa \n\t"
4321
"negl %%eax \n\t" // reverse sign of neg values
4324
"movl %%eax, _patemp \n\t" // save pa for later use
4326
"cmpl %%ecx, %%eax \n\t"
4327
"jna paeth_dabb \n\t"
4328
// pa > pb; now test if pb <= pc
4329
"cmpl _pctemp, %%ecx \n\t"
4330
"jna paeth_dbbc \n\t"
4331
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4332
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4333
"jmp paeth_dpaeth \n\t"
4336
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4337
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4338
"jmp paeth_dpaeth \n\t"
4341
// pa <= pb; now test if pa <= pc
4342
"cmpl _pctemp, %%eax \n\t"
4343
"jna paeth_dabc \n\t"
4344
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4345
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4346
"jmp paeth_dpaeth \n\t"
4349
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4350
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4352
"paeth_dpaeth: \n\t"
4355
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4356
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
4357
"cmpl _FullLength, %%ebx \n\t"
4362
"popl %%ebx \n\t" // index to Global Offset Table
4365
: "=c" (dummy_value_c), // output regs (dummy)
4366
"=S" (dummy_value_S),
4367
"=D" (dummy_value_D)
4369
: "0" (bpp), // ecx // input regs
4370
"1" (prev_row), // esi
4373
: "%eax", "%edx" // clobber list
4379
return; // No need to go further with this one
4381
} // end switch (bpp)
4383
__asm__ __volatile__ (
4384
// MMX acceleration complete; now do clean-up
4385
// check if any remaining bytes left to decode
4387
"pushl %%ebx \n\t" // save index to Global Offset Table
4389
"movl _MMXLength, %%ebx \n\t"
4390
"cmpl _FullLength, %%ebx \n\t"
4391
"jnb paeth_end \n\t"
4392
//pre "movl row, %%edi \n\t"
4393
//pre "movl prev_row, %%esi \n\t"
4394
// do Paeth decode for remaining bytes
4395
"movl %%ebx, %%edx \n\t"
4396
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4397
"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4398
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4401
"xorl %%eax, %%eax \n\t"
4402
// pav = p - a = (a + b - c) - a = b - c
4403
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4404
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4405
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4406
"movl %%eax, _patemp \n\t" // Save pav for later use
4407
"xorl %%eax, %%eax \n\t"
4408
// pbv = p - b = (a + b - c) - b = a - c
4409
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4410
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4411
"movl %%eax, %%ecx \n\t"
4412
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4413
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
4415
"testl $0x80000000, %%eax \n\t"
4416
"jz paeth_pca2 \n\t"
4417
"negl %%eax \n\t" // reverse sign of neg values
4420
"movl %%eax, _pctemp \n\t" // save pc for later use
4422
"testl $0x80000000, %%ecx \n\t"
4423
"jz paeth_pba2 \n\t"
4424
"negl %%ecx \n\t" // reverse sign of neg values
4427
"movl %%ecx, _pbtemp \n\t" // save pb for later use
4429
"movl _patemp, %%eax \n\t"
4430
"testl $0x80000000, %%eax \n\t"
4431
"jz paeth_paa2 \n\t"
4432
"negl %%eax \n\t" // reverse sign of neg values
4435
"movl %%eax, _patemp \n\t" // save pa for later use
4437
"cmpl %%ecx, %%eax \n\t"
4438
"jna paeth_abb2 \n\t"
4439
// pa > pb; now test if pb <= pc
4440
"cmpl _pctemp, %%ecx \n\t"
4441
"jna paeth_bbc2 \n\t"
4442
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4443
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4444
"jmp paeth_paeth2 \n\t"
4447
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4448
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4449
"jmp paeth_paeth2 \n\t"
4452
// pa <= pb; now test if pa <= pc
4453
"cmpl _pctemp, %%eax \n\t"
4454
"jna paeth_abc2 \n\t"
4455
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4456
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4457
"jmp paeth_paeth2 \n\t"
4460
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4461
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4463
"paeth_paeth2: \n\t"
4466
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4467
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
4468
"cmpl _FullLength, %%ebx \n\t"
4472
"EMMS \n\t" // end MMX; prep for poss. FP instrs.
4474
"popl %%ebx \n\t" // restore index to Global Offset Table
4477
: "=c" (dummy_value_c), // output regs (dummy)
4478
"=S" (dummy_value_S),
4479
"=D" (dummy_value_D)
4481
: "0" (bpp), // ecx // input regs
4482
"1" (prev_row), // esi
4485
: "%eax", "%edx" // clobber list (no input regs!)
4491
} /* end png_read_filter_row_mmx_paeth() */
#ifdef PNG_THREAD_UNSAFE_OK
//===========================================================================//
//                                                                           //
//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B             //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Sub filter decoder
static void /* PRIVATE */
4507
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4513
bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4514
_FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4516
__asm__ __volatile__ (
4517
//pre "movl row, %%edi \n\t"
4518
"movl %%edi, %%esi \n\t" // lp = row
4519
//pre "movl bpp, %%eax \n\t"
4520
"addl %%eax, %%edi \n\t" // rp = row + bpp
4521
//irr "xorl %%eax, %%eax \n\t"
4522
// get # of bytes to alignment
4523
"movl %%edi, _dif \n\t" // take start of row
4524
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4525
// alignment boundary
4526
"xorl %%ecx, %%ecx \n\t"
4527
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4528
"subl %%edi, _dif \n\t" // subtract from start ==> value
4529
"jz sub_go \n\t" // ecx at alignment
4531
"sub_lp1: \n\t" // fix alignment
4532
"movb (%%esi,%%ecx,), %%al \n\t"
4533
"addb %%al, (%%edi,%%ecx,) \n\t"
4535
"cmpl _dif, %%ecx \n\t"
4539
"movl _FullLength, %%eax \n\t"
4540
"movl %%eax, %%edx \n\t"
4541
"subl %%ecx, %%edx \n\t" // subtract alignment fix
4542
"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4543
"subl %%edx, %%eax \n\t" // drop over bytes from length
4544
"movl %%eax, _MMXLength \n\t"
4546
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4547
"=D" (dummy_value_D) // 1
4549
: "0" (bpp), // eax // input regs
4552
: "%esi", "%ecx", "%edx" // clobber list
4554
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4555
, "%mm0", "%mm1", "%mm2", "%mm3"
4556
, "%mm4", "%mm5", "%mm6", "%mm7"
4560
// now do the math for the rest of the row
4565
_ActiveMask.use = 0x0000ffffff000000LL;
4566
_ShiftBpp.use = 24; // == 3 * 8
4567
_ShiftRem.use = 40; // == 64 - 24
4569
__asm__ __volatile__ (
4570
// preload "movl row, %%edi \n\t"
4571
"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4572
// active byte group
4573
"movl %%edi, %%esi \n\t" // lp = row
4574
// preload "movl bpp, %%eax \n\t"
4575
"addl %%eax, %%edi \n\t" // rp = row + bpp
4576
"movq %%mm7, %%mm6 \n\t"
4577
"movl _dif, %%edx \n\t"
4578
"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4579
// 3rd active byte group
4580
// prime the pump: load the first Raw(x-bpp) data set
4581
"movq -8(%%edi,%%edx,), %%mm1 \n\t"
4583
"sub_3lp: \n\t" // shift data for adding first
4584
"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4585
// shift clears inactive bytes)
4586
// add 1st active group
4587
"movq (%%edi,%%edx,), %%mm0 \n\t"
4588
"paddb %%mm1, %%mm0 \n\t"
4590
// add 2nd active group
4591
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4592
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4593
"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4594
"paddb %%mm1, %%mm0 \n\t"
4596
// add 3rd active group
4597
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4598
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4599
"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4600
"addl $8, %%edx \n\t"
4601
"paddb %%mm1, %%mm0 \n\t"
4603
"cmpl _MMXLength, %%edx \n\t"
4604
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4605
"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4608
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4609
"=D" (dummy_value_D) // 1
4611
: "0" (bpp), // eax // input regs
4614
: "%edx", "%esi" // clobber list
4615
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4616
, "%mm0", "%mm1", "%mm6", "%mm7"
4624
__asm__ __volatile__ (
4625
"movl _dif, %%edx \n\t"
4626
// preload "movl row, %%edi \n\t"
4627
"cmpl _FullLength, %%edx \n\t"
4629
"movl %%edi, %%esi \n\t" // lp = row
4630
"xorl %%eax, %%eax \n\t"
4631
// preload "movl bpp, %%eax \n\t"
4632
"addl %%eax, %%edi \n\t" // rp = row + bpp
4635
"movb (%%esi,%%edx,), %%al \n\t"
4636
"addb %%al, (%%edi,%%edx,) \n\t"
4638
"cmpl _FullLength, %%edx \n\t"
4643
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4644
"=D" (dummy_value_D) // 1
4646
: "0" (bpp), // eax // input regs
4649
: "%edx", "%esi" // clobber list
4656
//case 7: // GRR BOGUS
4657
//case 5: // GRR BOGUS
4659
_ShiftBpp.use = bpp << 3;
4660
_ShiftRem.use = 64 - _ShiftBpp.use;
4662
__asm__ __volatile__ (
4663
// preload "movl row, %%edi \n\t"
4664
"movl _dif, %%edx \n\t"
4665
"movl %%edi, %%esi \n\t" // lp = row
4666
// preload "movl bpp, %%eax \n\t"
4667
"addl %%eax, %%edi \n\t" // rp = row + bpp
4669
// prime the pump: load the first Raw(x-bpp) data set
4670
"movq -8(%%edi,%%edx,), %%mm1 \n\t"
4672
"sub_4lp: \n\t" // shift data for adding first
4673
"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4674
// shift clears inactive bytes)
4675
"movq (%%edi,%%edx,), %%mm0 \n\t"
4676
"paddb %%mm1, %%mm0 \n\t"
4678
// add 2nd active group
4679
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4680
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4681
"addl $8, %%edx \n\t"
4682
"paddb %%mm1, %%mm0 \n\t"
4684
"cmpl _MMXLength, %%edx \n\t"
4685
"movq %%mm0, -8(%%edi,%%edx,) \n\t"
4686
"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4689
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4690
"=D" (dummy_value_D) // 1
4692
: "0" (bpp), // eax // input regs
4695
: "%edx", "%esi" // clobber list
4696
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4705
_ActiveMask.use = 0x00000000ffff0000LL;
4706
_ShiftBpp.use = 16; // == 2 * 8
4707
_ShiftRem.use = 48; // == 64 - 16
4709
__asm__ __volatile__ (
4710
"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4711
// active byte group
4712
"movl _dif, %%edx \n\t"
4713
"movq %%mm7, %%mm6 \n\t"
4714
// preload "movl row, %%edi \n\t"
4715
"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4716
// 3rd active byte group
4717
"movl %%edi, %%esi \n\t" // lp = row
4718
"movq %%mm6, %%mm5 \n\t"
4719
// preload "movl bpp, %%eax \n\t"
4720
"addl %%eax, %%edi \n\t" // rp = row + bpp
4721
"psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4722
// 4th active byte group
4723
// prime the pump: load the first Raw(x-bpp) data set
4724
"movq -8(%%edi,%%edx,), %%mm1 \n\t"
4726
"sub_2lp: \n\t" // shift data for adding first
4727
"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4728
// shift clears inactive bytes)
4729
// add 1st active group
4730
"movq (%%edi,%%edx,), %%mm0 \n\t"
4731
"paddb %%mm1, %%mm0 \n\t"
4733
// add 2nd active group
4734
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4735
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4736
"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4737
"paddb %%mm1, %%mm0 \n\t"
4739
// add 3rd active group
4740
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4741
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4742
"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4743
"paddb %%mm1, %%mm0 \n\t"
4745
// add 4th active group
4746
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4747
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4748
"pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4749
"addl $8, %%edx \n\t"
4750
"paddb %%mm1, %%mm0 \n\t"
4751
"cmpl _MMXLength, %%edx \n\t"
4752
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4753
"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4756
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4757
"=D" (dummy_value_D) // 1
4759
: "0" (bpp), // eax // input regs
4762
: "%edx", "%esi" // clobber list
4763
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4764
, "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4772
__asm__ __volatile__ (
4773
// preload "movl row, %%edi \n\t"
4774
"movl _dif, %%edx \n\t"
4775
"movl %%edi, %%esi \n\t" // lp = row
4776
// preload "movl bpp, %%eax \n\t"
4777
"addl %%eax, %%edi \n\t" // rp = row + bpp
4778
"movl _MMXLength, %%ecx \n\t"
4780
// prime the pump: load the first Raw(x-bpp) data set
4781
"movq -8(%%edi,%%edx,), %%mm7 \n\t"
4782
"andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4785
"movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4786
"paddb %%mm7, %%mm0 \n\t"
4787
"movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4788
"movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4790
// Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4791
// This will be repeated for each group of 8 bytes with the 8th
4792
// group being used as the Raw(x-bpp) for the 1st group of the
4795
"paddb %%mm0, %%mm1 \n\t"
4796
"movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4797
"movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4798
"paddb %%mm1, %%mm2 \n\t"
4799
"movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4800
"movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4801
"paddb %%mm2, %%mm3 \n\t"
4802
"movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4803
"movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4804
"paddb %%mm3, %%mm4 \n\t"
4805
"movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4806
"movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4807
"paddb %%mm4, %%mm5 \n\t"
4808
"movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4809
"movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4810
"paddb %%mm5, %%mm6 \n\t"
4811
"movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4812
"movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4813
"addl $64, %%edx \n\t"
4814
"paddb %%mm6, %%mm7 \n\t"
4815
"cmpl %%ecx, %%edx \n\t"
4816
"movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4819
"cmpl _MMXLength, %%edx \n\t"
4823
"movq (%%edi,%%edx,), %%mm0 \n\t"
4824
"addl $8, %%edx \n\t"
4825
"paddb %%mm7, %%mm0 \n\t"
4826
"cmpl _MMXLength, %%edx \n\t"
4827
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4828
"movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4829
// to mm1 to be new Raw(x-bpp)
4835
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4836
"=D" (dummy_value_D) // 1
4838
: "0" (bpp), // eax // input regs
4841
: "%ecx", "%edx", "%esi" // clobber list
4842
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4843
, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4849
default: // bpp greater than 8 bytes GRR BOGUS
4851
__asm__ __volatile__ (
4852
"movl _dif, %%edx \n\t"
4853
// preload "movl row, %%edi \n\t"
4854
"movl %%edi, %%esi \n\t" // lp = row
4855
// preload "movl bpp, %%eax \n\t"
4856
"addl %%eax, %%edi \n\t" // rp = row + bpp
4859
"movq (%%edi,%%edx,), %%mm0 \n\t"
4860
"movq (%%esi,%%edx,), %%mm1 \n\t"
4861
"addl $8, %%edx \n\t"
4862
"paddb %%mm1, %%mm0 \n\t"
4863
"cmpl _MMXLength, %%edx \n\t"
4864
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4865
// -8 to offset addl edx
4868
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4869
"=D" (dummy_value_D) // 1
4871
: "0" (bpp), // eax // input regs
4874
: "%edx", "%esi" // clobber list
4875
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4882
} // end switch (bpp)
4884
__asm__ __volatile__ (
4885
"movl _MMXLength, %%edx \n\t"
4886
//pre "movl row, %%edi \n\t"
4887
"cmpl _FullLength, %%edx \n\t"
4890
"movl %%edi, %%esi \n\t" // lp = row
4891
//pre "movl bpp, %%eax \n\t"
4892
"addl %%eax, %%edi \n\t" // rp = row + bpp
4893
"xorl %%eax, %%eax \n\t"
4896
"movb (%%esi,%%edx,), %%al \n\t"
4897
"addb %%al, (%%edi,%%edx,) \n\t"
4899
"cmpl _FullLength, %%edx \n\t"
4903
"EMMS \n\t" // end MMX instructions
4905
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4906
"=D" (dummy_value_D) // 1
4908
: "0" (bpp), // eax // input regs
4911
: "%edx", "%esi" // clobber list
4914
} // end of png_read_filter_row_mmx_sub()
4920
//===========================================================================//
4922
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4924
//===========================================================================//
4926
// Optimized code for PNG Up filter decoder
4928
/*
 * png_read_filter_row_mmx_up(): undo the PNG "Up" row filter in place,
 * i.e. row[i] = (row[i] + prev_row[i]) & 0xff for every byte of the row
 * (the byte-wise add is visible in the scalar loops: addb (%esi,%ebx) into
 * %al loaded from (%edi,%ebx), and in the MMX paddb of prev_row/row quads).
 *
 * Strategy (three phases):
 *   1. scalar byte loop "up_lp1" to advance %edi/%ebx to 8-byte alignment;
 *   2. a 64-bytes-per-iteration MMX loop using all eight mm registers,
 *      interleaved to reduce branches and partial stalls;
 *   3. an 8-byte MMX loop "up_lpA" and a final scalar loop "up_lp2" for the
 *      leftover bytes, then EMMS to restore the FPU state.
 *
 * Register roles (from the constraint lists at the bottom): %edx = len,
 * %esi = prev_row, %edi = row (preloads commented "//pre" above); %eax and
 * %ecx are scratch and appear in the clobber list.
 *
 * NOTE(review): this extracted view interleaves bare original-file line
 * numbers (4929, 4933, ...) between source lines and has DROPPED several
 * original lines — e.g. the "up_loop:" label targeted by "jb up_loop", the
 * "up_lt8:" label, the declarations of dummy_value_S/dummy_value_D and len,
 * and the closing ");" of the asm statement. The code below is left
 * byte-identical to the damaged extract; restore from a pristine
 * libpng 1.2.x pnggccrd.c before compiling.
 */
static void /* PRIVATE */
4929
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4933
int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4937
len = row_info->rowbytes; // number of bytes to filter
4939
__asm__ __volatile__ (
4940
//pre "movl row, %%edi \n\t"
4941
// get # of bytes to alignment
4945
"movl %%edi, %%ecx \n\t"
4946
"xorl %%ebx, %%ebx \n\t"
4947
"addl $0x7, %%ecx \n\t"
4948
"xorl %%eax, %%eax \n\t"
4949
// round row pointer up to the next 8-byte boundary; %ecx becomes the
// number of leading bytes handled by the scalar alignment loop below
"andl $0xfffffff8, %%ecx \n\t"
4950
//pre "movl prev_row, %%esi \n\t"
4951
"subl %%edi, %%ecx \n\t"
4954
"up_lp1: \n\t" // fix alignment
4955
"movb (%%edi,%%ebx,), %%al \n\t"
4956
"addb (%%esi,%%ebx,), %%al \n\t"
4958
"cmpl %%ecx, %%ebx \n\t"
4959
"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4960
"jb up_lp1 \n\t" // offset incl ebx
4963
//pre "movl len, %%edx \n\t"
4964
"movl %%edx, %%ecx \n\t"
4965
"subl %%ebx, %%edx \n\t" // subtract alignment fix
4966
"andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4967
"subl %%edx, %%ecx \n\t" // drop over bytes from length
4969
// unrolled loop - use all MMX registers and interleave to reduce
4970
// number of branch instructions (loops) and reduce partial stalls
4972
// NOTE(review): the "up_loop:" label that the later "jb up_loop" jumps to
// was presumably here in the original file but is missing from this extract
"movq (%%esi,%%ebx,), %%mm1 \n\t"
4973
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4974
"movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4975
"paddb %%mm1, %%mm0 \n\t"
4976
"movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4977
"movq %%mm0, (%%edi,%%ebx,) \n\t"
4978
"paddb %%mm3, %%mm2 \n\t"
4979
"movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4980
"movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4981
"movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4982
"movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4983
"paddb %%mm5, %%mm4 \n\t"
4984
"movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4985
"movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4986
"paddb %%mm7, %%mm6 \n\t"
4987
"movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4988
"movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4989
"movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4990
"movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4991
"paddb %%mm1, %%mm0 \n\t"
4992
"movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4993
"movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4994
"paddb %%mm3, %%mm2 \n\t"
4995
"movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4996
"movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4997
"movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4998
"movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4999
"paddb %%mm5, %%mm4 \n\t"
5000
"movq 56(%%edi,%%ebx,), %%mm6 \n\t"
5001
"movq %%mm4, 48(%%edi,%%ebx,) \n\t"
5002
"addl $64, %%ebx \n\t"
5003
"paddb %%mm7, %%mm6 \n\t"
5004
"cmpl %%ecx, %%ebx \n\t"
5005
"movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
5006
"jb up_loop \n\t" // -8 to offset addl ebx
5008
"cmpl $0, %%edx \n\t" // test for bytes over mult of 64
5011
"cmpl $8, %%edx \n\t" // test for less than 8 bytes
5012
"jb up_lt8 \n\t" // [added by lcreeve at netins.net]
5014
"addl %%edx, %%ecx \n\t"
5015
"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
5016
"subl %%edx, %%ecx \n\t" // drop over bytes from length
5019
"up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
5020
"movq (%%esi,%%ebx,), %%mm1 \n\t"
5021
"movq (%%edi,%%ebx,), %%mm0 \n\t"
5022
"addl $8, %%ebx \n\t"
5023
"paddb %%mm1, %%mm0 \n\t"
5024
"cmpl %%ecx, %%ebx \n\t"
5025
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
5026
"jb up_lpA \n\t" // offset add ebx
5027
"cmpl $0, %%edx \n\t" // test for bytes over mult of 8
5031
// NOTE(review): the "up_lt8:" label referenced by "jb up_lt8" above is
// missing from this extract (another dropped line)
"xorl %%eax, %%eax \n\t"
5032
"addl %%edx, %%ecx \n\t" // move over byte count into counter
5034
"up_lp2: \n\t" // use x86 regs for remaining bytes
5035
"movb (%%edi,%%ebx,), %%al \n\t"
5036
"addb (%%esi,%%ebx,), %%al \n\t"
5038
"cmpl %%ecx, %%ebx \n\t"
5039
"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
5040
"jb up_lp2 \n\t" // offset inc ebx
5043
"EMMS \n\t" // conversion of filtered row complete
5048
// dummy outputs tie len/prev_row/row to fixed registers without letting
// gcc assume their final values; dummy_value_S/dummy_value_D declarations
// were dropped by the extraction (see header note)
: "=d" (dummy_value_d), // 0 // output regs (dummy)
5049
"=S" (dummy_value_S), // 1
5050
"=D" (dummy_value_D) // 2
5052
: "0" (len), // edx // input regs
5053
"1" (prev_row), // esi
5056
: "%eax", "%ecx" // clobber list (no input regs!)
5061
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5062
, "%mm0", "%mm1", "%mm2", "%mm3"
5063
, "%mm4", "%mm5", "%mm6", "%mm7"
5067
} // end of png_read_filter_row_mmx_up()
5069
#endif /* PNG_MMX_CODE_SUPPORTED */
5074
/*===========================================================================*/
5076
/* P N G _ R E A D _ F I L T E R _ R O W */
5078
/*===========================================================================*/
5081
/* Optimized png_read_filter_row routines */
5084
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5085
row, png_bytep prev_row, int filter)
5091
#if defined(PNG_MMX_CODE_SUPPORTED)
5092
/* GRR: these are superseded by png_ptr->asm_flags: */
5093
#define UseMMX_sub 1 // GRR: converted 20000730
5094
#define UseMMX_up 1 // GRR: converted 20000729
5095
#define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
5096
#define UseMMX_paeth 1 // GRR: converted 20000828
5098
if (_mmx_supported == 2) {
5099
/* this should have happened in png_init_mmx_flags() already */
5100
#if !defined(PNG_1_0_X)
5101
png_warning(png_ptr, "asm_flags may not have been initialized");
5105
#endif /* PNG_MMX_CODE_SUPPORTED */
5108
png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5111
case 0: sprintf(filnm, "none");
5113
case 1: sprintf(filnm, "sub-%s",
5114
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5115
#if !defined(PNG_1_0_X)
5116
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5121
case 2: sprintf(filnm, "up-%s",
5122
#ifdef PNG_MMX_CODE_SUPPORTED
5123
#if !defined(PNG_1_0_X)
5124
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5129
case 3: sprintf(filnm, "avg-%s",
5130
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5131
#if !defined(PNG_1_0_X)
5132
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5137
case 4: sprintf(filnm, "Paeth-%s",
5138
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5139
#if !defined(PNG_1_0_X)
5140
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5145
default: sprintf(filnm, "unknw");
5148
png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5149
png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5150
png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5151
(int)((row_info->pixel_depth + 7) >> 3));
5152
png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5153
#endif /* PNG_DEBUG */
5157
case PNG_FILTER_VALUE_NONE:
5160
case PNG_FILTER_VALUE_SUB:
5161
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5162
#if !defined(PNG_1_0_X)
5163
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5164
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5165
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5170
png_read_filter_row_mmx_sub(row_info, row);
5173
#endif /* PNG_MMX_CODE_SUPPORTED */
5176
png_uint_32 istop = row_info->rowbytes;
5177
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5178
png_bytep rp = row + bpp;
5181
for (i = bpp; i < istop; i++)
5183
*rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5186
} /* end !UseMMX_sub */
5189
case PNG_FILTER_VALUE_UP:
5190
#if defined(PNG_MMX_CODE_SUPPORTED)
5191
#if !defined(PNG_1_0_X)
5192
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5193
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5194
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5199
png_read_filter_row_mmx_up(row_info, row, prev_row);
5202
#endif /* PNG_MMX_CODE_SUPPORTED */
5205
png_uint_32 istop = row_info->rowbytes;
5207
png_bytep pp = prev_row;
5209
for (i = 0; i < istop; ++i)
5211
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5214
} /* end !UseMMX_up */
5217
case PNG_FILTER_VALUE_AVG:
5218
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5219
#if !defined(PNG_1_0_X)
5220
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5221
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5222
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5227
png_read_filter_row_mmx_avg(row_info, row, prev_row);
5230
#endif /* PNG_MMX_CODE_SUPPORTED */
5234
png_bytep pp = prev_row;
5236
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5237
png_uint_32 istop = row_info->rowbytes - bpp;
5239
for (i = 0; i < bpp; i++)
5241
*rp = (png_byte)(((int)(*rp) +
5242
((int)(*pp++) >> 1)) & 0xff);
5246
for (i = 0; i < istop; i++)
5248
*rp = (png_byte)(((int)(*rp) +
5249
((int)(*pp++ + *lp++) >> 1)) & 0xff);
5252
} /* end !UseMMX_avg */
5255
case PNG_FILTER_VALUE_PAETH:
5256
#if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5257
#if !defined(PNG_1_0_X)
5258
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5259
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5260
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5265
png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5268
#endif /* PNG_MMX_CODE_SUPPORTED */
5272
png_bytep pp = prev_row;
5274
png_bytep cp = prev_row;
5275
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5276
png_uint_32 istop = row_info->rowbytes - bpp;
5278
for (i = 0; i < bpp; i++)
5280
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5284
for (i = 0; i < istop; i++) /* use leftover rp,pp */
5286
int a, b, c, pa, pb, pc, p;
5300
pa = p < 0 ? -p : p;
5301
pb = pc < 0 ? -pc : pc;
5302
pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5306
if (pa <= pb && pa <= pc)
5314
p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5316
*rp = (png_byte)(((int)(*rp) + p) & 0xff);
5319
} /* end !UseMMX_paeth */
5323
png_warning(png_ptr, "Ignoring bad row-filter type");
5329
#endif /* PNG_HAVE_MMX_READ_FILTER_ROW */
5332
/*===========================================================================*/
5334
/* P N G _ M M X _ S U P P O R T */
5336
/*===========================================================================*/
5338
/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5339
* (2) all instructions compile with gcc 2.7.2.3 and later
5340
* (3) the function is moved down here to prevent gcc from
5341
* inlining it in multiple places and then barfing be-
5342
* cause the ".NOT_SUPPORTED" label is multiply defined
5343
* [is there a way to signal that a *single* function should
5344
* not be inlined? is there a way to modify the label for
5345
* each inlined instance, e.g., by appending _1, _2, etc.?
5346
* maybe if don't use leading "." in label name? (nope...sigh)]
5350
png_mmx_support(void)
5352
#if defined(PNG_MMX_CODE_SUPPORTED)
1
/* pnggccrd.c was removed from libpng-1.2.20. */
3
/* This code snippet is for use by configure's compilation test. */
5
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && \
6
defined(PNG_MMX_CODE_SUPPORTED)
7
int PNGAPI png_dummy_mmx_support(void);
9
static int _mmx_supported = 2; // 0: no MMX; 1: MMX supported; 2: not tested
12
png_dummy_mmx_support(void) __attribute__((noinline));
15
png_dummy_mmx_support(void)
18
#if defined(PNG_MMX_CODE_SUPPORTED) // superfluous, but what the heck
5354
19
__asm__ __volatile__ (
20
#if defined(__x86_64__)
21
"pushq %%rbx \n\t" // rbx gets clobbered by CPUID instruction
22
"pushq %%rcx \n\t" // so does rcx...
23
"pushq %%rdx \n\t" // ...and rdx (but rcx & rdx safe on Linux)
24
"pushfq \n\t" // save Eflag to stack
25
"popq %%rax \n\t" // get Eflag from stack into rax
26
"movq %%rax, %%rcx \n\t" // make another copy of Eflag in rcx
27
"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
28
"pushq %%rax \n\t" // save modified Eflag back to stack
29
"popfq \n\t" // restore modified value to Eflag reg
30
"pushfq \n\t" // save Eflag to stack
31
"popq %%rax \n\t" // get Eflag from stack
32
"pushq %%rcx \n\t" // save original Eflag to stack
33
"popfq \n\t" // restore original Eflag
5355
35
"pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5356
36
"pushl %%ecx \n\t" // so does ecx...
5357
37
"pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5358
// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5359
// "pushf \n\t" // 16-bit pushf
5360
38
"pushfl \n\t" // save Eflag to stack
5361
39
"popl %%eax \n\t" // get Eflag from stack into eax
5362
40
"movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5363
41
"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5364
42
"pushl %%eax \n\t" // save modified Eflag back to stack
5365
// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5366
// "popf \n\t" // 16-bit popf
5367
43
"popfl \n\t" // restore modified value to Eflag reg
5368
44
"pushfl \n\t" // save Eflag to stack
5369
45
"popl %%eax \n\t" // get Eflag from stack
5370
46
"pushl %%ecx \n\t" // save original Eflag to stack
5371
47
"popfl \n\t" // restore original Eflag
5372
49
"xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5373
50
"jz 0f \n\t" // if same, CPUID instr. is not supported