1
/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3
* For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
5
* See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6
* and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7
* for Intel's performance analysis of the MMX vs. non-MMX code.
9
* libpng version 1.2.8 - December 3, 2004
10
* For conditions of distribution and use, see copyright notice in png.h
11
* Copyright (c) 1998-2004 Glenn Randers-Pehrson
12
* Copyright (c) 1998, Intel Corporation
14
* Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15
* Interface to libpng contributed by Gilles Vollant, 1999.
16
* GNU C port by Greg Roelofs, 1999-2001.
18
* Lines 2350-4300 converted in place with intel2gas 1.3.1:
20
* intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
22
* and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
24
* NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25
* is required to assemble the newer MMX instructions such as movq.
* For djgpp, see
28
* ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
30
* (or a later version in the same directory). For Linux, check your
31
* distribution's web site(s) or try these links:
33
* http://rufus.w3.org/linux/RPM/binutils.html
34
* http://www.debian.org/Packages/stable/devel/binutils.html
35
* ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
38
* For other platforms, see the main GNU site:
40
* ftp://ftp.gnu.org/pub/gnu/binutils/
42
* Version 2.5.2l.15 is definitely too old...
46
* TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47
* =====================================
50
* - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
53
* - additional optimizations (possible or definite):
54
* x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55
* - write MMX code for 48-bit case (pixel_bytes == 6)
56
* - figure out what's up with 24-bit case (pixel_bytes == 3):
57
* why subtract 8 from width_mmx in the pass 4/5 case?
58
* (only width_mmx case) (near line 1606)
59
* x [DONE] replace pixel_bytes within each block with the true
60
* constant value (or are compilers smart enough to do that?)
61
* - rewrite all MMX interlacing code so it's aligned with
62
* the *beginning* of the row buffer, not the end. This
63
* would not only allow one to eliminate half of the memory
64
* writes for odd passes (that is, pass == odd), it may also
65
* eliminate some unaligned-data-access exceptions (assuming
66
* there's a penalty for not aligning 64-bit accesses on
67
* 64-bit boundaries). The only catch is that the "leftover"
68
* pixel(s) at the end of the row would have to be saved,
69
* but there are enough unused MMX registers in every case,
70
* so this is not a problem. A further benefit is that the
71
* post-MMX cleanup code (C code) in at least some of the
72
* cases could be done within the assembler block.
73
* x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74
* inconsistent, and don't match the MMX Programmer's Reference
75
* Manual conventions anyway. They should be changed to
76
* "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77
* was lowest in memory (e.g., corresponding to a left pixel)
78
* and b7 is the byte that was highest (e.g., a right pixel).
81
* - Brennan's Guide notwithstanding, gcc under Linux does *not*
82
* want globals prefixed by underscores when referencing them--
83
* i.e., if the variable is const4, then refer to it as const4,
84
* not _const4. This seems to be a djgpp-specific requirement.
85
* Also, such variables apparently *must* be declared outside
86
* of functions; neither static nor automatic variables work if
87
* defined within the scope of a single function, but both
88
* static and truly global (multi-module) variables work fine.
91
* - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92
* - switched from string-concatenation-with-macros to cleaner method of
93
* renaming global variables for djgpp--i.e., always use prefixes in
94
* inlined assembler code (== strings) and conditionally rename the
95
* variables, not the other way around. Hence _const4, _mask8_0, etc.
98
* - fixed mmxsupport()/png_do_read_interlace() first-row bug
99
* This one was severely weird: even though mmxsupport() doesn't touch
100
* ebx (where "row" pointer was stored), it nevertheless managed to zero
101
* the register (even in static/non-fPIC code--see below), which in turn
102
* caused png_do_read_interlace() to return prematurely on the first row of
103
* interlaced images (i.e., without expanding the interlaced pixels).
104
* Inspection of the generated assembly code didn't turn up any clues,
105
* although it did point at a minor optimization (i.e., get rid of
106
* mmx_supported_local variable and just use eax). Possibly the CPUID
107
* instruction is more destructive than it looks? (Not yet checked.)
108
* - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109
* listings... Apparently register spillage has to do with ebx, since
110
* it's used to index the global offset table. Commenting it out of the
111
* input-reg lists in png_combine_row() eliminated compiler barfage, so
112
* ifdef'd with __PIC__ macro: if defined, use a global for unmask
115
* - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116
* "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
119
* - made "diff" variable (now "_dif") global to simplify conversion of
120
* filtering routines (running out of regs, sigh). "diff" is still used
121
* in interlacing routines, however.
122
* - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123
* macro determines which is used); original not yet tested.
126
* - when compiling with gcc, be sure to use -fomit-frame-pointer
129
* - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130
* pass == 4 or 5, that caused visible corruption of interlaced images
133
* - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134
* many of the form "forbidden register 0 (ax) was spilled for class AREG."
135
* This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136
* Chuck Wilson supplied a patch involving dummy output registers. See
137
* http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138
* for the original (anonymous) SourceForge bug report.
141
* - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142
* pnggccrd.c: In function `png_combine_row':
143
* pnggccrd.c:525: more than 10 operands in `asm'
144
* pnggccrd.c:669: more than 10 operands in `asm'
145
* pnggccrd.c:828: more than 10 operands in `asm'
146
* pnggccrd.c:994: more than 10 operands in `asm'
147
* pnggccrd.c:1177: more than 10 operands in `asm'
148
* They are all the same problem and can be worked around by using the
149
* global _unmask variable unconditionally, not just in the -fPIC case.
150
* Reportedly earlier versions of gcc also have the problem with more than
151
* 10 operands; they just don't report it. Much strangeness ensues, etc.
154
* - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155
* MMX routine); began converting png_read_filter_row_mmx_sub()
156
* - to finish remaining sections:
157
* - clean up indentation and comments
158
* - preload local variables
159
* - add output and input regs (order of former determines numerical
* mapping of latter)
161
* - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162
* - remove "$" from addressing of Shift and Mask variables [20000823]
165
* - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
168
* - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169
* shared-library (-fPIC) version! Code works just fine as part of static
170
* library. Damn damn damn damn damn, should have tested that sooner.
171
* ebx is getting clobbered again (explicitly this time); need to save it
172
* on stack or rewrite asm code to avoid using it altogether. Blargh!
175
* - first section was trickiest; all remaining sections have ebx -> edx now.
176
* (-fPIC works again.) Also added missing underscores to various Shift*
177
* and *Mask* globals and got rid of leading "$" signs.
180
* - added visual separators to help navigate microscopic printed copies
181
* (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182
* on png_read_filter_row_mmx_avg()
185
* - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186
* What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187
* cleaned up/shortened in either routine, but functionality is complete
188
* and seems to be working fine.
191
* - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192
* as an input reg (with dummy output variables, etc.), then it *cannot*
193
* also appear in the clobber list or gcc 2.95.2 will barf. The solution
194
* is simple enough...
197
* - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198
* correctly (but 48-bit RGB just fine)
201
* - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202
* - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203
* - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204
* - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
207
* - added new png_init_mmx_flags() function (here only because it needs to
208
* call mmxsupport(), which should probably become global png_mmxsupport());
209
* modified other MMX routines to run conditionally (png_ptr->asm_flags)
212
* - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213
* and made it public; moved png_init_mmx_flags() to png.c as internal func
216
* - removed dependency on png_read_filter_row_c() (C code already duplicated
217
* within MMX version of png_read_filter_row()) so no longer necessary to
218
* compile it into pngrutil.o
221
* - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
224
* - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
227
* - more tinkering with clobber list at lines 4529 and 5033, to get
228
* it to compile on gcc-3.4.
231
* - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
232
* - write MMX code for 48-bit case (pixel_bytes == 6)
233
* - figure out what's up with 24-bit case (pixel_bytes == 3):
234
* why subtract 8 from width_mmx in the pass 4/5 case?
235
* (only width_mmx case) (near line 1606)
236
* - rewrite all MMX interlacing code so it's aligned with beginning
237
* of the row buffer, not the end (see 19991007 for details)
238
* x pick one version of mmxsupport() and get rid of the other
239
* - add error messages to any remaining bogus default cases
240
* - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
241
* x add support for runtime enable/disable/query of various MMX routines
*/
247
#if defined(PNG_USE_PNGGCCRD)
249
// Forward declaration of the MMX-detection routine; its definition is not in
// this part of the file. Per the changelog above it replaced mmxsupport(), was
// made public, and also sets _mmx_supported as a side effect.
int PNGAPI png_mmx_support(void);
251
#ifdef PNG_USE_LOCAL_ARRAYS
252
// Per-pass tables, indexed by png_ptr->pass (7 entries each).
// png_pass_start: offset, in pixels, of the first pixel handled in the pass
// (png_combine_row() scales it by bytes-per-pixel to get initial_val).
static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
253
// png_pass_inc: step, in pixels, between successive copies in the pass
// (scaled by bytes-per-pixel to get the loop stride).
static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
254
// png_pass_width: number of pixels copied at each step by the C fallback in
// png_combine_row() (scaled by bytes-per-pixel to get rep_bytes).
static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
257
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
258
/* djgpp, Win32, and Cygwin add their own underscores to global variables,
259
* so define them without: */
260
// The inline-assembly strings in this file always spell these globals with a
// leading underscore (see the changelog entry about renaming globals for
// djgpp). On toolchains that prepend an underscore to C symbols themselves,
// the C identifiers are renamed to the bare form so that the emitted symbol
// matches the name used in the assembly text.
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
261
# define _mmx_supported mmx_supported
262
# define _const4 const4
263
# define _const6 const6
264
# define _mask8_0 mask8_0
265
# define _mask16_1 mask16_1
266
# define _mask16_0 mask16_0
267
# define _mask24_2 mask24_2
268
# define _mask24_1 mask24_1
269
# define _mask24_0 mask24_0
270
# define _mask32_3 mask32_3
271
# define _mask32_2 mask32_2
272
# define _mask32_1 mask32_1
273
# define _mask32_0 mask32_0
274
# define _mask48_5 mask48_5
275
# define _mask48_4 mask48_4
276
# define _mask48_3 mask48_3
277
# define _mask48_2 mask48_2
278
# define _mask48_1 mask48_1
279
# define _mask48_0 mask48_0
280
# define _LBCarryMask LBCarryMask
281
# define _HBClearMask HBClearMask
282
# define _ActiveMask ActiveMask
283
# define _ActiveMask2 ActiveMask2
284
# define _ActiveMaskEnd ActiveMaskEnd
285
# define _ShiftBpp ShiftBpp
286
# define _ShiftRem ShiftRem
287
// Renames for the globals that are only defined under PNG_THREAD_UNSAFE_OK
// (their definitions further down carry the same guard).
// NOTE(review): the #endif lines closing this #ifdef and the enclosing #if are
// not visible in this view -- confirm where the guarded region ends.
#ifdef PNG_THREAD_UNSAFE_OK
288
# define _unmask unmask
289
# define _FullLength FullLength
290
# define _MMXLength MMXLength
292
# define _patemp patemp
293
# define _pbtemp pbtemp
294
# define _pctemp pctemp
299
/* These constants are used in the inlined MMX assembly code.
300
Ignore gcc's "At top level: defined but not used" warnings. */
302
/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
303
* since that case uses the %ebx register for indexing the Global Offset Table
304
* and there were no other registers available. But gcc 2.95 and later emit
305
* "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
306
* in the non-PIC case, so we'll just use the global unconditionally now.
*/
308
#ifdef PNG_THREAD_UNSAFE_OK
312
// Bit-selector constants for the MMX paths of png_combine_row(). The assembly
// replicates the one-byte _unmask value (~mask) into all 8 bytes of an MMX
// register, ANDs it with one of these quadwords and compares every byte with
// zero, which yields an all-ones/all-zeros select mask per byte. Each byte
// below therefore names the single mask bit that governs that byte position;
// in the _0 constants the lowest-addressed byte uses bit 0x80.
// For 16/24/32 bits per pixel, _maskN_k is applied to the k-th group of
// 8 bytes (offset 8*k) of each 8-pixel run.

// 8 bits per pixel: one byte per mask bit.
static unsigned long long _mask8_0 = 0x0102040810204080LL;
314
// 16 bits per pixel: two bytes per mask bit, two quadwords per 8 pixels.
static unsigned long long _mask16_1 = 0x0101020204040808LL;
315
static unsigned long long _mask16_0 = 0x1010202040408080LL;
317
// 24 bits per pixel: three bytes per mask bit, three quadwords per 8 pixels.
static unsigned long long _mask24_2 = 0x0101010202020404LL;
318
static unsigned long long _mask24_1 = 0x0408080810101020LL;
319
static unsigned long long _mask24_0 = 0x2020404040808080LL;
321
// 32 bits per pixel: four bytes per mask bit, four quadwords per 8 pixels.
static unsigned long long _mask32_3 = 0x0101010102020202LL;
322
static unsigned long long _mask32_2 = 0x0404040408080808LL;
323
static unsigned long long _mask32_1 = 0x1010101020202020LL;
324
static unsigned long long _mask32_0 = 0x4040404080808080LL;
326
// Six bytes per mask bit, six quadwords per 8 pixels.
// NOTE(review): presumably the 48-bits-per-pixel case; the code that uses
// these is not visible in this view -- confirm.
static unsigned long long _mask48_5 = 0x0101010101010202LL;
327
static unsigned long long _mask48_4 = 0x0202020204040404LL;
328
static unsigned long long _mask48_3 = 0x0404080808080808LL;
329
static unsigned long long _mask48_2 = 0x1010101010102020LL;
330
static unsigned long long _mask48_1 = 0x2020202040404040LL;
331
static unsigned long long _mask48_0 = 0x4040808080808080LL;
333
// _const4 keeps the low three bytes of a quadword, _const6 the low byte.
// NOTE(review): the code that uses them is not visible in this view.
static unsigned long long _const4 = 0x0000000000FFFFFFLL;
334
//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
335
static unsigned long long _const6 = 0x00000000000000FFLL;
337
// Scratch globals for the row-filter routines. They would be local variables
338
// if gcc's inline assembly could address locals by name (see the changelog
339
// note that such variables must be declared outside of functions).
// WARNING: Their presence probably defeats the thread safety of libpng, which
// is why they are only compiled in when PNG_THREAD_UNSAFE_OK is defined.
// NOTE(review): the matching #endif is not visible in this view.
341
#ifdef PNG_THREAD_UNSAFE_OK
342
static png_uint_32 _FullLength;
343
static png_uint_32 _MMXLength;
345
static int _patemp; // temp variables for Paeth routine
351
// Assigns each file-scope MMX global to itself. No value changes; the
// self-assignments only make the compiler count the variables as used.
// Presumably this exists to silence the "defined but not used" warnings
// mentioned where the constants are introduced, since the real uses are
// inside inline-assembly strings that the compiler does not inspect.
// NOTE(review): the return type, braces and some statements of this function
// are not visible in this view.
png_squelch_warnings(void)
353
// The globals touched in this section only exist under PNG_THREAD_UNSAFE_OK.
#ifdef PNG_THREAD_UNSAFE_OK
358
_MMXLength = _MMXLength;
363
_mask16_1 = _mask16_1;
364
_mask16_0 = _mask16_0;
365
_mask24_2 = _mask24_2;
366
_mask24_1 = _mask24_1;
367
_mask24_0 = _mask24_0;
368
_mask32_3 = _mask32_3;
369
_mask32_2 = _mask32_2;
370
_mask32_1 = _mask32_1;
371
_mask32_0 = _mask32_0;
372
_mask48_5 = _mask48_5;
373
_mask48_4 = _mask48_4;
374
_mask48_3 = _mask48_3;
375
_mask48_2 = _mask48_2;
376
_mask48_1 = _mask48_1;
377
_mask48_0 = _mask48_0;
379
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
382
// Cached MMX-capability state. The initial value 2 acts as a "not yet
// determined" marker: png_combine_row() tests for it and warns that asm_flags
// may not have been initialized.
// NOTE(review): the other values (presumably 0 = no MMX, 1 = MMX available)
// are assigned outside this view -- confirm.
static int _mmx_supported = 2;
384
/*===========================================================================*/
386
/* P N G _ C O M B I N E _ R O W */
388
/*===========================================================================*/
390
#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
393
#define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
395
#define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
398
/* Combines the row recently read in with the previous row.
399
This routine takes care of alpha and transparency if requested.
400
This routine also handles the two methods of progressive display
401
of interlaced images, depending on the mask value.
402
The mask value describes which pixels are to be combined with
403
the row. The pattern always repeats every 8 pixels, so just 8
404
bits are needed. A one indicates the pixel is to be combined; a
405
zero indicates the pixel is to be skipped. This is in addition
406
to any alpha or transparency value associated with the pixel.
407
If you want all pixels to be combined, pass 0xff (255) in mask. */
409
/* Use this routine for the x86 platform - it uses a faster MMX routine
410
if the machine supports MMX. */
413
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
415
png_debug(1, "in png_combine_row (pnggccrd.c)\n");
417
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
418
if (_mmx_supported == 2) {
419
#if !defined(PNG_1_0_X)
420
/* this should have happened in png_init_mmx_flags() already */
421
png_warning(png_ptr, "asm_flags may not have been initialized");
429
png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
430
png_memcpy(row, png_ptr->row_buf + 1,
431
(png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
433
else /* (png_combine_row() is never called with mask == 0) */
435
switch (png_ptr->row_info.pixel_depth)
437
case 1: /* png_ptr->row_info.pixel_depth */
441
int s_inc, s_start, s_end;
446
sp = png_ptr->row_buf + 1;
449
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
450
if (png_ptr->transformations & PNG_PACKSWAP)
466
for (i = 0; i < png_ptr->width; i++)
472
value = (*sp >> shift) & 0x1;
473
*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
474
*dp |= (png_byte)(value << shift);
494
case 2: /* png_ptr->row_info.pixel_depth */
498
int s_start, s_end, s_inc;
504
sp = png_ptr->row_buf + 1;
507
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
508
if (png_ptr->transformations & PNG_PACKSWAP)
524
for (i = 0; i < png_ptr->width; i++)
528
value = (*sp >> shift) & 0x3;
529
*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
530
*dp |= (png_byte)(value << shift);
549
case 4: /* png_ptr->row_info.pixel_depth */
553
int s_start, s_end, s_inc;
559
sp = png_ptr->row_buf + 1;
562
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
563
if (png_ptr->transformations & PNG_PACKSWAP)
578
for (i = 0; i < png_ptr->width; i++)
582
value = (*sp >> shift) & 0xf;
583
*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
584
*dp |= (png_byte)(value << shift);
603
case 8: /* png_ptr->row_info.pixel_depth */
608
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
609
#if !defined(PNG_1_0_X)
610
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
611
/* && _mmx_supported */ )
618
int dummy_value_a; // fix 'forbidden register spilled' error
623
_unmask = ~mask; // global variable for -fPIC version
624
srcptr = png_ptr->row_buf + 1;
626
len = png_ptr->width &~7; // reduce to multiple of 8
627
diff = (int) (png_ptr->width & 7); // amount lost
629
__asm__ __volatile__ (
630
"movd _unmask, %%mm7 \n\t" // load bit pattern
631
"psubb %%mm6, %%mm6 \n\t" // zero mm6
632
"punpcklbw %%mm7, %%mm7 \n\t"
633
"punpcklwd %%mm7, %%mm7 \n\t"
634
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
636
"movq _mask8_0, %%mm0 \n\t"
637
"pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
638
"pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
640
// preload "movl len, %%ecx \n\t" // load length of line
641
// preload "movl srcptr, %%esi \n\t" // load source
642
// preload "movl dstptr, %%edi \n\t" // load dest
644
"cmpl $0, %%ecx \n\t" // len == 0 ?
645
"je mainloop8end \n\t"
648
"movq (%%esi), %%mm4 \n\t" // *srcptr
649
"pand %%mm0, %%mm4 \n\t"
650
"movq %%mm0, %%mm6 \n\t"
651
"pandn (%%edi), %%mm6 \n\t" // *dstptr
652
"por %%mm6, %%mm4 \n\t"
653
"movq %%mm4, (%%edi) \n\t"
654
"addl $8, %%esi \n\t" // inc by 8 bytes processed
655
"addl $8, %%edi \n\t"
656
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
660
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
661
"movl %%eax, %%ecx \n\t"
662
"cmpl $0, %%ecx \n\t"
664
// preload "movl mask, %%edx \n\t"
665
"sall $24, %%edx \n\t" // make low byte, high byte
668
"sall %%edx \n\t" // move high bit to CF
669
"jnc skip8 \n\t" // if CF = 0
670
"movb (%%esi), %%al \n\t"
671
"movb %%al, (%%edi) \n\t"
677
"jnz secondloop8 \n\t"
682
: "=a" (dummy_value_a), // output regs (dummy)
683
"=d" (dummy_value_d),
684
"=c" (dummy_value_c),
685
"=S" (dummy_value_S),
688
: "3" (srcptr), // esi // input regs
691
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
695
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
696
: "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
700
else /* mmx _not supported - Use modified C routine */
701
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
703
register png_uint_32 i;
704
png_uint_32 initial_val = png_pass_start[png_ptr->pass];
705
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
706
register int stride = png_pass_inc[png_ptr->pass];
707
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
708
register int rep_bytes = png_pass_width[png_ptr->pass];
709
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
710
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
711
int diff = (int) (png_ptr->width & 7); /* amount lost */
712
register png_uint_32 final_val = len; /* GRR bugfix */
714
srcptr = png_ptr->row_buf + 1 + initial_val;
715
dstptr = row + initial_val;
717
for (i = initial_val; i < final_val; i += stride)
719
png_memcpy(dstptr, srcptr, rep_bytes);
723
if (diff) /* number of leftover pixels: 3 for pngtest */
725
final_val+=diff /* *BPP1 */ ;
726
for (; i < final_val; i += stride)
728
if (rep_bytes > (int)(final_val-i))
729
rep_bytes = (int)(final_val-i);
730
png_memcpy(dstptr, srcptr, rep_bytes);
736
} /* end of else (_mmx_supported) */
741
case 16: /* png_ptr->row_info.pixel_depth */
746
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
747
#if !defined(PNG_1_0_X)
748
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
749
/* && _mmx_supported */ )
756
int dummy_value_a; // fix 'forbidden register spilled' error
761
_unmask = ~mask; // global variable for -fPIC version
762
srcptr = png_ptr->row_buf + 1;
764
len = png_ptr->width &~7; // reduce to multiple of 8
765
diff = (int) (png_ptr->width & 7); // amount lost //
767
__asm__ __volatile__ (
768
"movd _unmask, %%mm7 \n\t" // load bit pattern
769
"psubb %%mm6, %%mm6 \n\t" // zero mm6
770
"punpcklbw %%mm7, %%mm7 \n\t"
771
"punpcklwd %%mm7, %%mm7 \n\t"
772
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
774
"movq _mask16_0, %%mm0 \n\t"
775
"movq _mask16_1, %%mm1 \n\t"
777
"pand %%mm7, %%mm0 \n\t"
778
"pand %%mm7, %%mm1 \n\t"
780
"pcmpeqb %%mm6, %%mm0 \n\t"
781
"pcmpeqb %%mm6, %%mm1 \n\t"
783
// preload "movl len, %%ecx \n\t" // load length of line
784
// preload "movl srcptr, %%esi \n\t" // load source
785
// preload "movl dstptr, %%edi \n\t" // load dest
787
"cmpl $0, %%ecx \n\t"
788
"jz mainloop16end \n\t"
791
"movq (%%esi), %%mm4 \n\t"
792
"pand %%mm0, %%mm4 \n\t"
793
"movq %%mm0, %%mm6 \n\t"
794
"movq (%%edi), %%mm7 \n\t"
795
"pandn %%mm7, %%mm6 \n\t"
796
"por %%mm6, %%mm4 \n\t"
797
"movq %%mm4, (%%edi) \n\t"
799
"movq 8(%%esi), %%mm5 \n\t"
800
"pand %%mm1, %%mm5 \n\t"
801
"movq %%mm1, %%mm7 \n\t"
802
"movq 8(%%edi), %%mm6 \n\t"
803
"pandn %%mm6, %%mm7 \n\t"
804
"por %%mm7, %%mm5 \n\t"
805
"movq %%mm5, 8(%%edi) \n\t"
807
"addl $16, %%esi \n\t" // inc by 16 bytes processed
808
"addl $16, %%edi \n\t"
809
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
812
"mainloop16end: \n\t"
813
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
814
"movl %%eax, %%ecx \n\t"
815
"cmpl $0, %%ecx \n\t"
817
// preload "movl mask, %%edx \n\t"
818
"sall $24, %%edx \n\t" // make low byte, high byte
821
"sall %%edx \n\t" // move high bit to CF
822
"jnc skip16 \n\t" // if CF = 0
823
"movw (%%esi), %%ax \n\t"
824
"movw %%ax, (%%edi) \n\t"
827
"addl $2, %%esi \n\t"
828
"addl $2, %%edi \n\t"
830
"jnz secondloop16 \n\t"
835
: "=a" (dummy_value_a), // output regs (dummy)
836
"=c" (dummy_value_c),
837
"=d" (dummy_value_d),
838
"=S" (dummy_value_S),
841
: "0" (diff), // eax // input regs
842
// was (unmask) " " RESERVED // ebx // Global Offset Table idx
848
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
849
: "%mm0", "%mm1", "%mm4" // clobber list
850
, "%mm5", "%mm6", "%mm7"
854
else /* mmx _not supported - Use modified C routine */
855
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
857
register png_uint_32 i;
858
png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
859
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
860
register int stride = BPP2 * png_pass_inc[png_ptr->pass];
861
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
862
register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
863
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
864
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
865
int diff = (int) (png_ptr->width & 7); /* amount lost */
866
register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
868
srcptr = png_ptr->row_buf + 1 + initial_val;
869
dstptr = row + initial_val;
871
for (i = initial_val; i < final_val; i += stride)
873
png_memcpy(dstptr, srcptr, rep_bytes);
877
if (diff) /* number of leftover pixels: 3 for pngtest */
879
final_val+=diff*BPP2;
880
for (; i < final_val; i += stride)
882
if (rep_bytes > (int)(final_val-i))
883
rep_bytes = (int)(final_val-i);
884
png_memcpy(dstptr, srcptr, rep_bytes);
889
} /* end of else (_mmx_supported) */
894
case 24: /* png_ptr->row_info.pixel_depth */
899
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
900
#if !defined(PNG_1_0_X)
901
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
902
/* && _mmx_supported */ )
909
int dummy_value_a; // fix 'forbidden register spilled' error
914
_unmask = ~mask; // global variable for -fPIC version
915
srcptr = png_ptr->row_buf + 1;
917
len = png_ptr->width &~7; // reduce to multiple of 8
918
diff = (int) (png_ptr->width & 7); // amount lost //
920
__asm__ __volatile__ (
921
"movd _unmask, %%mm7 \n\t" // load bit pattern
922
"psubb %%mm6, %%mm6 \n\t" // zero mm6
923
"punpcklbw %%mm7, %%mm7 \n\t"
924
"punpcklwd %%mm7, %%mm7 \n\t"
925
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
927
"movq _mask24_0, %%mm0 \n\t"
928
"movq _mask24_1, %%mm1 \n\t"
929
"movq _mask24_2, %%mm2 \n\t"
931
"pand %%mm7, %%mm0 \n\t"
932
"pand %%mm7, %%mm1 \n\t"
933
"pand %%mm7, %%mm2 \n\t"
935
"pcmpeqb %%mm6, %%mm0 \n\t"
936
"pcmpeqb %%mm6, %%mm1 \n\t"
937
"pcmpeqb %%mm6, %%mm2 \n\t"
939
// preload "movl len, %%ecx \n\t" // load length of line
940
// preload "movl srcptr, %%esi \n\t" // load source
941
// preload "movl dstptr, %%edi \n\t" // load dest
943
"cmpl $0, %%ecx \n\t"
944
"jz mainloop24end \n\t"
947
"movq (%%esi), %%mm4 \n\t"
948
"pand %%mm0, %%mm4 \n\t"
949
"movq %%mm0, %%mm6 \n\t"
950
"movq (%%edi), %%mm7 \n\t"
951
"pandn %%mm7, %%mm6 \n\t"
952
"por %%mm6, %%mm4 \n\t"
953
"movq %%mm4, (%%edi) \n\t"
955
"movq 8(%%esi), %%mm5 \n\t"
956
"pand %%mm1, %%mm5 \n\t"
957
"movq %%mm1, %%mm7 \n\t"
958
"movq 8(%%edi), %%mm6 \n\t"
959
"pandn %%mm6, %%mm7 \n\t"
960
"por %%mm7, %%mm5 \n\t"
961
"movq %%mm5, 8(%%edi) \n\t"
963
"movq 16(%%esi), %%mm6 \n\t"
964
"pand %%mm2, %%mm6 \n\t"
965
"movq %%mm2, %%mm4 \n\t"
966
"movq 16(%%edi), %%mm7 \n\t"
967
"pandn %%mm7, %%mm4 \n\t"
968
"por %%mm4, %%mm6 \n\t"
969
"movq %%mm6, 16(%%edi) \n\t"
971
"addl $24, %%esi \n\t" // inc by 24 bytes processed
972
"addl $24, %%edi \n\t"
973
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
977
"mainloop24end: \n\t"
978
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
979
"movl %%eax, %%ecx \n\t"
980
"cmpl $0, %%ecx \n\t"
982
// preload "movl mask, %%edx \n\t"
983
"sall $24, %%edx \n\t" // make low byte, high byte
986
"sall %%edx \n\t" // move high bit to CF
987
"jnc skip24 \n\t" // if CF = 0
988
"movw (%%esi), %%ax \n\t"
989
"movw %%ax, (%%edi) \n\t"
990
"xorl %%eax, %%eax \n\t"
991
"movb 2(%%esi), %%al \n\t"
992
"movb %%al, 2(%%edi) \n\t"
995
"addl $3, %%esi \n\t"
996
"addl $3, %%edi \n\t"
998
"jnz secondloop24 \n\t"
1003
: "=a" (dummy_value_a), // output regs (dummy)
1004
"=d" (dummy_value_d),
1005
"=c" (dummy_value_c),
1006
"=S" (dummy_value_S),
1007
"=D" (dummy_value_D)
1009
: "3" (srcptr), // esi // input regs
1010
"4" (dstptr), // edi
1012
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1016
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1017
: "%mm0", "%mm1", "%mm2" // clobber list
1018
, "%mm4", "%mm5", "%mm6", "%mm7"
1022
else /* mmx _not supported - Use modified C routine */
1023
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1025
register png_uint_32 i;
1026
png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1027
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1028
register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1029
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1030
register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1031
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1032
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1033
int diff = (int) (png_ptr->width & 7); /* amount lost */
1034
register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */
1036
srcptr = png_ptr->row_buf + 1 + initial_val;
1037
dstptr = row + initial_val;
1039
for (i = initial_val; i < final_val; i += stride)
1041
png_memcpy(dstptr, srcptr, rep_bytes);
1045
if (diff) /* number of leftover pixels: 3 for pngtest */
1047
final_val+=diff*BPP3;
1048
for (; i < final_val; i += stride)
1050
if (rep_bytes > (int)(final_val-i))
1051
rep_bytes = (int)(final_val-i);
1052
png_memcpy(dstptr, srcptr, rep_bytes);
1057
} /* end of else (_mmx_supported) */
1062
case 32: /* png_ptr->row_info.pixel_depth */
1067
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1068
#if !defined(PNG_1_0_X)
1069
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1070
/* && _mmx_supported */ )
1077
int dummy_value_a; // fix 'forbidden register spilled' error
1082
_unmask = ~mask; // global variable for -fPIC version
1083
srcptr = png_ptr->row_buf + 1;
1085
len = png_ptr->width &~7; // reduce to multiple of 8
1086
diff = (int) (png_ptr->width & 7); // amount lost //
1088
__asm__ __volatile__ (
1089
"movd _unmask, %%mm7 \n\t" // load bit pattern
1090
"psubb %%mm6, %%mm6 \n\t" // zero mm6
1091
"punpcklbw %%mm7, %%mm7 \n\t"
1092
"punpcklwd %%mm7, %%mm7 \n\t"
1093
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1095
"movq _mask32_0, %%mm0 \n\t"
1096
"movq _mask32_1, %%mm1 \n\t"
1097
"movq _mask32_2, %%mm2 \n\t"
1098
"movq _mask32_3, %%mm3 \n\t"
1100
"pand %%mm7, %%mm0 \n\t"
1101
"pand %%mm7, %%mm1 \n\t"
1102
"pand %%mm7, %%mm2 \n\t"
1103
"pand %%mm7, %%mm3 \n\t"
1105
"pcmpeqb %%mm6, %%mm0 \n\t"
1106
"pcmpeqb %%mm6, %%mm1 \n\t"
1107
"pcmpeqb %%mm6, %%mm2 \n\t"
1108
"pcmpeqb %%mm6, %%mm3 \n\t"
1110
// preload "movl len, %%ecx \n\t" // load length of line
1111
// preload "movl srcptr, %%esi \n\t" // load source
1112
// preload "movl dstptr, %%edi \n\t" // load dest
1114
"cmpl $0, %%ecx \n\t" // lcr
1115
"jz mainloop32end \n\t"
1118
"movq (%%esi), %%mm4 \n\t"
1119
"pand %%mm0, %%mm4 \n\t"
1120
"movq %%mm0, %%mm6 \n\t"
1121
"movq (%%edi), %%mm7 \n\t"
1122
"pandn %%mm7, %%mm6 \n\t"
1123
"por %%mm6, %%mm4 \n\t"
1124
"movq %%mm4, (%%edi) \n\t"
1126
"movq 8(%%esi), %%mm5 \n\t"
1127
"pand %%mm1, %%mm5 \n\t"
1128
"movq %%mm1, %%mm7 \n\t"
1129
"movq 8(%%edi), %%mm6 \n\t"
1130
"pandn %%mm6, %%mm7 \n\t"
1131
"por %%mm7, %%mm5 \n\t"
1132
"movq %%mm5, 8(%%edi) \n\t"
1134
"movq 16(%%esi), %%mm6 \n\t"
1135
"pand %%mm2, %%mm6 \n\t"
1136
"movq %%mm2, %%mm4 \n\t"
1137
"movq 16(%%edi), %%mm7 \n\t"
1138
"pandn %%mm7, %%mm4 \n\t"
1139
"por %%mm4, %%mm6 \n\t"
1140
"movq %%mm6, 16(%%edi) \n\t"
1142
"movq 24(%%esi), %%mm7 \n\t"
1143
"pand %%mm3, %%mm7 \n\t"
1144
"movq %%mm3, %%mm5 \n\t"
1145
"movq 24(%%edi), %%mm4 \n\t"
1146
"pandn %%mm4, %%mm5 \n\t"
1147
"por %%mm5, %%mm7 \n\t"
1148
"movq %%mm7, 24(%%edi) \n\t"
1150
"addl $32, %%esi \n\t" // inc by 32 bytes processed
1151
"addl $32, %%edi \n\t"
1152
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
1153
"ja mainloop32 \n\t"
1155
"mainloop32end: \n\t"
1156
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1157
"movl %%eax, %%ecx \n\t"
1158
"cmpl $0, %%ecx \n\t"
1160
// preload "movl mask, %%edx \n\t"
1161
"sall $24, %%edx \n\t" // low byte => high byte
1163
"secondloop32: \n\t"
1164
"sall %%edx \n\t" // move high bit to CF
1165
"jnc skip32 \n\t" // if CF = 0
1166
"movl (%%esi), %%eax \n\t"
1167
"movl %%eax, (%%edi) \n\t"
1170
"addl $4, %%esi \n\t"
1171
"addl $4, %%edi \n\t"
1173
"jnz secondloop32 \n\t"
1178
: "=a" (dummy_value_a), // output regs (dummy)
1179
"=d" (dummy_value_d),
1180
"=c" (dummy_value_c),
1181
"=S" (dummy_value_S),
1182
"=D" (dummy_value_D)
1184
: "3" (srcptr), // esi // input regs
1185
"4" (dstptr), // edi
1187
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1191
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1192
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1193
, "%mm4", "%mm5", "%mm6", "%mm7"
1197
else /* mmx _not supported - Use modified C routine */
1198
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1200
register png_uint_32 i;
1201
png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1202
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1203
register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1204
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1205
register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1206
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1207
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1208
int diff = (int) (png_ptr->width & 7); /* amount lost */
1209
register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */
1211
srcptr = png_ptr->row_buf + 1 + initial_val;
1212
dstptr = row + initial_val;
1214
for (i = initial_val; i < final_val; i += stride)
1216
png_memcpy(dstptr, srcptr, rep_bytes);
1220
if (diff) /* number of leftover pixels: 3 for pngtest */
1222
final_val+=diff*BPP4;
1223
for (; i < final_val; i += stride)
1225
if (rep_bytes > (int)(final_val-i))
1226
rep_bytes = (int)(final_val-i);
1227
png_memcpy(dstptr, srcptr, rep_bytes);
1232
} /* end of else (_mmx_supported) */
1237
case 48: /* png_ptr->row_info.pixel_depth */
1242
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1243
#if !defined(PNG_1_0_X)
1244
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1245
/* && _mmx_supported */ )
1252
int dummy_value_a; // fix 'forbidden register spilled' error
1257
_unmask = ~mask; // global variable for -fPIC version
1258
srcptr = png_ptr->row_buf + 1;
1260
len = png_ptr->width &~7; // reduce to multiple of 8
1261
diff = (int) (png_ptr->width & 7); // amount lost //
1263
__asm__ __volatile__ (
1264
"movd _unmask, %%mm7 \n\t" // load bit pattern
1265
"psubb %%mm6, %%mm6 \n\t" // zero mm6
1266
"punpcklbw %%mm7, %%mm7 \n\t"
1267
"punpcklwd %%mm7, %%mm7 \n\t"
1268
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1270
"movq _mask48_0, %%mm0 \n\t"
1271
"movq _mask48_1, %%mm1 \n\t"
1272
"movq _mask48_2, %%mm2 \n\t"
1273
"movq _mask48_3, %%mm3 \n\t"
1274
"movq _mask48_4, %%mm4 \n\t"
1275
"movq _mask48_5, %%mm5 \n\t"
1277
"pand %%mm7, %%mm0 \n\t"
1278
"pand %%mm7, %%mm1 \n\t"
1279
"pand %%mm7, %%mm2 \n\t"
1280
"pand %%mm7, %%mm3 \n\t"
1281
"pand %%mm7, %%mm4 \n\t"
1282
"pand %%mm7, %%mm5 \n\t"
1284
"pcmpeqb %%mm6, %%mm0 \n\t"
1285
"pcmpeqb %%mm6, %%mm1 \n\t"
1286
"pcmpeqb %%mm6, %%mm2 \n\t"
1287
"pcmpeqb %%mm6, %%mm3 \n\t"
1288
"pcmpeqb %%mm6, %%mm4 \n\t"
1289
"pcmpeqb %%mm6, %%mm5 \n\t"
1291
// preload "movl len, %%ecx \n\t" // load length of line
1292
// preload "movl srcptr, %%esi \n\t" // load source
1293
// preload "movl dstptr, %%edi \n\t" // load dest
1295
"cmpl $0, %%ecx \n\t"
1296
"jz mainloop48end \n\t"
1299
"movq (%%esi), %%mm7 \n\t"
1300
"pand %%mm0, %%mm7 \n\t"
1301
"movq %%mm0, %%mm6 \n\t"
1302
"pandn (%%edi), %%mm6 \n\t"
1303
"por %%mm6, %%mm7 \n\t"
1304
"movq %%mm7, (%%edi) \n\t"
1306
"movq 8(%%esi), %%mm6 \n\t"
1307
"pand %%mm1, %%mm6 \n\t"
1308
"movq %%mm1, %%mm7 \n\t"
1309
"pandn 8(%%edi), %%mm7 \n\t"
1310
"por %%mm7, %%mm6 \n\t"
1311
"movq %%mm6, 8(%%edi) \n\t"
1313
"movq 16(%%esi), %%mm6 \n\t"
1314
"pand %%mm2, %%mm6 \n\t"
1315
"movq %%mm2, %%mm7 \n\t"
1316
"pandn 16(%%edi), %%mm7 \n\t"
1317
"por %%mm7, %%mm6 \n\t"
1318
"movq %%mm6, 16(%%edi) \n\t"
1320
"movq 24(%%esi), %%mm7 \n\t"
1321
"pand %%mm3, %%mm7 \n\t"
1322
"movq %%mm3, %%mm6 \n\t"
1323
"pandn 24(%%edi), %%mm6 \n\t"
1324
"por %%mm6, %%mm7 \n\t"
1325
"movq %%mm7, 24(%%edi) \n\t"
1327
"movq 32(%%esi), %%mm6 \n\t"
1328
"pand %%mm4, %%mm6 \n\t"
1329
"movq %%mm4, %%mm7 \n\t"
1330
"pandn 32(%%edi), %%mm7 \n\t"
1331
"por %%mm7, %%mm6 \n\t"
1332
"movq %%mm6, 32(%%edi) \n\t"
1334
"movq 40(%%esi), %%mm7 \n\t"
1335
"pand %%mm5, %%mm7 \n\t"
1336
"movq %%mm5, %%mm6 \n\t"
1337
"pandn 40(%%edi), %%mm6 \n\t"
1338
"por %%mm6, %%mm7 \n\t"
1339
"movq %%mm7, 40(%%edi) \n\t"
1341
"addl $48, %%esi \n\t" // inc by 48 bytes processed
1342
"addl $48, %%edi \n\t"
1343
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
1345
"ja mainloop48 \n\t"
1347
"mainloop48end: \n\t"
1348
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1349
"movl %%eax, %%ecx \n\t"
1350
"cmpl $0, %%ecx \n\t"
1352
// preload "movl mask, %%edx \n\t"
1353
"sall $24, %%edx \n\t" // make low byte, high byte
1355
"secondloop48: \n\t"
1356
"sall %%edx \n\t" // move high bit to CF
1357
"jnc skip48 \n\t" // if CF = 0
1358
"movl (%%esi), %%eax \n\t"
1359
"movl %%eax, (%%edi) \n\t"
1362
"addl $4, %%esi \n\t"
1363
"addl $4, %%edi \n\t"
1365
"jnz secondloop48 \n\t"
1370
: "=a" (dummy_value_a), // output regs (dummy)
1371
"=d" (dummy_value_d),
1372
"=c" (dummy_value_c),
1373
"=S" (dummy_value_S),
1374
"=D" (dummy_value_D)
1376
: "3" (srcptr), // esi // input regs
1377
"4" (dstptr), // edi
1379
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1383
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1384
: "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1385
, "%mm4", "%mm5", "%mm6", "%mm7"
1389
else /* mmx _not supported - Use modified C routine */
1390
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
1392
register png_uint_32 i;
1393
png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1394
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1395
register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1396
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1397
register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1398
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1399
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1400
int diff = (int) (png_ptr->width & 7); /* amount lost */
1401
register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */
1403
srcptr = png_ptr->row_buf + 1 + initial_val;
1404
dstptr = row + initial_val;
1406
for (i = initial_val; i < final_val; i += stride)
1408
png_memcpy(dstptr, srcptr, rep_bytes);
1412
if (diff) /* number of leftover pixels: 3 for pngtest */
1414
final_val+=diff*BPP6;
1415
for (; i < final_val; i += stride)
1417
if (rep_bytes > (int)(final_val-i))
1418
rep_bytes = (int)(final_val-i);
1419
png_memcpy(dstptr, srcptr, rep_bytes);
1424
} /* end of else (_mmx_supported) */
1429
case 64: /* png_ptr->row_info.pixel_depth */
1433
register png_uint_32 i;
1434
png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1435
/* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1436
register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1437
/* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1438
register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1439
/* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1440
png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
1441
int diff = (int) (png_ptr->width & 7); /* amount lost */
1442
register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */
1444
srcptr = png_ptr->row_buf + 1 + initial_val;
1445
dstptr = row + initial_val;
1447
for (i = initial_val; i < final_val; i += stride)
1449
png_memcpy(dstptr, srcptr, rep_bytes);
1453
if (diff) /* number of leftover pixels: 3 for pngtest */
1455
final_val+=diff*BPP8;
1456
for (; i < final_val; i += stride)
1458
if (rep_bytes > (int)(final_val-i))
1459
rep_bytes = (int)(final_val-i);
1460
png_memcpy(dstptr, srcptr, rep_bytes);
1469
default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1471
/* this should never happen */
1472
png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1475
} /* end switch (png_ptr->row_info.pixel_depth) */
1477
} /* end if (non-trivial mask) */
1479
} /* end png_combine_row() */
1481
#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1486
/*===========================================================================*/
1488
/* P N G _ D O _ R E A D _ I N T E R L A C E */
1490
/*===========================================================================*/
1492
#if defined(PNG_READ_INTERLACING_SUPPORTED)
1493
#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1495
/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1496
* has taken place. [GRR: what other steps come before and/or after?]
1500
png_do_read_interlace(png_structp png_ptr)
1502
png_row_infop row_info = &(png_ptr->row_info);
1503
png_bytep row = png_ptr->row_buf + 1;
1504
int pass = png_ptr->pass;
1505
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1506
png_uint_32 transformations = png_ptr->transformations;
1509
png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1511
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1512
if (_mmx_supported == 2) {
1513
#if !defined(PNG_1_0_X)
1514
/* this should have happened in png_init_mmx_flags() already */
1515
png_warning(png_ptr, "asm_flags may not have been initialized");
1521
if (row != NULL && row_info != NULL)
1523
png_uint_32 final_width;
1525
final_width = row_info->width * png_pass_inc[pass];
1527
switch (row_info->pixel_depth)
1533
int s_start, s_end, s_inc;
1538
sp = row + (png_size_t)((row_info->width - 1) >> 3);
1539
dp = row + (png_size_t)((final_width - 1) >> 3);
1540
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1541
if (transformations & PNG_PACKSWAP)
1543
sshift = (int)((row_info->width + 7) & 7);
1544
dshift = (int)((final_width + 7) & 7);
1552
sshift = 7 - (int)((row_info->width + 7) & 7);
1553
dshift = 7 - (int)((final_width + 7) & 7);
1559
for (i = row_info->width; i; i--)
1561
v = (png_byte)((*sp >> sshift) & 0x1);
1562
for (j = 0; j < png_pass_inc[pass]; j++)
1564
*dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1565
*dp |= (png_byte)(v << dshift);
1566
if (dshift == s_end)
1574
if (sshift == s_end)
1589
int s_start, s_end, s_inc;
1592
sp = row + (png_size_t)((row_info->width - 1) >> 2);
1593
dp = row + (png_size_t)((final_width - 1) >> 2);
1594
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1595
if (transformations & PNG_PACKSWAP)
1597
sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1598
dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1606
sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1607
dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1613
for (i = row_info->width; i; i--)
1618
v = (png_byte)((*sp >> sshift) & 0x3);
1619
for (j = 0; j < png_pass_inc[pass]; j++)
1621
*dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1622
*dp |= (png_byte)(v << dshift);
1623
if (dshift == s_end)
1631
if (sshift == s_end)
1646
int s_start, s_end, s_inc;
1649
sp = row + (png_size_t)((row_info->width - 1) >> 1);
1650
dp = row + (png_size_t)((final_width - 1) >> 1);
1651
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1652
if (transformations & PNG_PACKSWAP)
1654
sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1655
dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1663
sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1664
dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1670
for (i = row_info->width; i; i--)
1675
v = (png_byte)((*sp >> sshift) & 0xf);
1676
for (j = 0; j < png_pass_inc[pass]; j++)
1678
*dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1679
*dp |= (png_byte)(v << dshift);
1680
if (dshift == s_end)
1688
if (sshift == s_end)
1699
/*====================================================================*/
1701
default: /* 8-bit or larger (this is where the routine is modified) */
1704
// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1705
// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1706
// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1707
// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1711
png_size_t pixel_bytes;
1712
int width = (int)row_info->width;
1714
pixel_bytes = (row_info->pixel_depth >> 3);
1716
/* point sptr at the last pixel in the pre-expanded row: */
1717
sptr = row + (width - 1) * pixel_bytes;
1719
/* point dp at the last pixel position in the expanded row: */
1720
dp = row + (final_width - 1) * pixel_bytes;
1722
/* New code by Nirav Chhatrapati - Intel Corporation */
1724
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1725
#if !defined(PNG_1_0_X)
1726
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1727
/* && _mmx_supported */ )
1732
//--------------------------------------------------------------
1733
if (pixel_bytes == 3)
1735
if (((pass == 0) || (pass == 1)) && width)
1737
int dummy_value_c; // fix 'forbidden register spilled'
1741
__asm__ __volatile__ (
1742
"subl $21, %%edi \n\t"
1743
// (png_pass_inc[pass] - 1)*pixel_bytes
1745
".loop3_pass0: \n\t"
1746
"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1747
"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1748
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1749
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1750
"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1751
"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1752
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1753
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1754
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1755
"movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1756
"psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1757
"movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1758
"punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1759
"movq %%mm4, 16(%%edi) \n\t"
1760
"psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1761
"movq %%mm3, 8(%%edi) \n\t"
1762
"punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1763
"subl $3, %%esi \n\t"
1764
"movq %%mm0, (%%edi) \n\t"
1765
"subl $24, %%edi \n\t"
1767
"jnz .loop3_pass0 \n\t"
1770
: "=c" (dummy_value_c), // output regs (dummy)
1771
"=S" (dummy_value_S),
1772
"=D" (dummy_value_D)
1774
: "1" (sptr), // esi // input regs
1777
"rim" (_const4) // %1(?) (0x0000000000FFFFFFLL)
1779
#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1780
: "%mm0", "%mm1", "%mm2" // clobber list
1785
else if (((pass == 2) || (pass == 3)) && width)
1787
int dummy_value_c; // fix 'forbidden register spilled'
1791
__asm__ __volatile__ (
1792
"subl $9, %%edi \n\t"
1793
// (png_pass_inc[pass] - 1)*pixel_bytes
1795
".loop3_pass2: \n\t"
1796
"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1797
"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1798
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1799
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1800
"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1801
"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1802
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1803
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1804
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1805
"movq %%mm0, 4(%%edi) \n\t"
1806
"psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1807
"subl $3, %%esi \n\t"
1808
"movd %%mm0, (%%edi) \n\t"
1809
"subl $12, %%edi \n\t"
1811
"jnz .loop3_pass2 \n\t"
1814
: "=c" (dummy_value_c), // output regs (dummy)
1815
"=S" (dummy_value_S),
1816
"=D" (dummy_value_D)
1818
: "1" (sptr), // esi // input regs
1821
"rim" (_const4) // (0x0000000000FFFFFFLL)
1823
#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1824
: "%mm0", "%mm1", "%mm2" // clobber list
1828
else if (width) /* && ((pass == 4) || (pass == 5)) */
1830
int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1833
width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1836
// png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1837
// sptr points at last pixel in pre-expanded row
1838
// dp points at last pixel position in expanded row
1839
int dummy_value_c; // fix 'forbidden register spilled'
1843
__asm__ __volatile__ (
1844
"subl $3, %%esi \n\t"
1845
"subl $9, %%edi \n\t"
1846
// (png_pass_inc[pass] + 1)*pixel_bytes
1848
".loop3_pass4: \n\t"
1849
"movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1850
"movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1851
"movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1852
"psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1853
"pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1854
"psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1855
"por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1856
"movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1857
"psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1858
"movq %%mm0, (%%edi) \n\t"
1859
"psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1860
"pand _const6, %%mm3 \n\t" // z z z z z z z 5
1861
"por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1862
"subl $6, %%esi \n\t"
1863
"movd %%mm2, 8(%%edi) \n\t"
1864
"subl $12, %%edi \n\t"
1865
"subl $2, %%ecx \n\t"
1866
"jnz .loop3_pass4 \n\t"
1869
: "=c" (dummy_value_c), // output regs (dummy)
1870
"=S" (dummy_value_S),
1871
"=D" (dummy_value_D)
1873
: "1" (sptr), // esi // input regs
1875
"0" (width_mmx), // ecx
1876
"rim" (_const4), // 0x0000000000FFFFFFLL
1877
"rim" (_const6) // 0x00000000000000FFLL
1879
#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1880
: "%mm0", "%mm1" // clobber list
1886
sptr -= width_mmx*3;
1888
for (i = width; i; i--)
1893
png_memcpy(v, sptr, 3);
1894
for (j = 0; j < png_pass_inc[pass]; j++)
1896
png_memcpy(dp, v, 3);
1902
} /* end of pixel_bytes == 3 */
1904
//--------------------------------------------------------------
1905
else if (pixel_bytes == 1)
1907
if (((pass == 0) || (pass == 1)) && width)
1909
int width_mmx = ((width >> 2) << 2);
1910
width -= width_mmx; // 0-3 pixels => 0-3 bytes
1913
int dummy_value_c; // fix 'forbidden register spilled'
1917
__asm__ __volatile__ (
1918
"subl $3, %%esi \n\t"
1919
"subl $31, %%edi \n\t"
1921
".loop1_pass0: \n\t"
1922
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1923
"movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1924
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1925
"movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1926
"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1927
"movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1928
"punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1929
"punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1930
"movq %%mm0, (%%edi) \n\t"
1931
"punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1932
"movq %%mm3, 8(%%edi) \n\t"
1933
"movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1934
"punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1935
"punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1936
"movq %%mm2, 16(%%edi) \n\t"
1937
"subl $4, %%esi \n\t"
1938
"movq %%mm4, 24(%%edi) \n\t"
1939
"subl $32, %%edi \n\t"
1940
"subl $4, %%ecx \n\t"
1941
"jnz .loop1_pass0 \n\t"
1944
: "=c" (dummy_value_c), // output regs (dummy)
1945
"=S" (dummy_value_S),
1946
"=D" (dummy_value_D)
1948
: "1" (sptr), // esi // input regs
1950
"0" (width_mmx) // ecx
1952
#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1953
: "%mm0", "%mm1", "%mm2" // clobber list
1961
for (i = width; i; i--)
1965
/* I simplified this part in version 1.0.4e
1966
* here and in several other instances where
1967
* pixel_bytes == 1 -- GR-P
1972
* png_memcpy(v, sptr, pixel_bytes);
1973
* for (j = 0; j < png_pass_inc[pass]; j++)
1975
* png_memcpy(dp, v, pixel_bytes);
1976
* dp -= pixel_bytes;
1978
* sptr -= pixel_bytes;
1980
* Replacement code is in the next three lines:
1983
for (j = 0; j < png_pass_inc[pass]; j++)
1990
else if (((pass == 2) || (pass == 3)) && width)
1992
int width_mmx = ((width >> 2) << 2);
1993
width -= width_mmx; // 0-3 pixels => 0-3 bytes
1996
int dummy_value_c; // fix 'forbidden register spilled'
2000
__asm__ __volatile__ (
2001
"subl $3, %%esi \n\t"
2002
"subl $15, %%edi \n\t"
2004
".loop1_pass2: \n\t"
2005
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2006
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2007
"movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
2008
"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
2009
"punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
2010
"movq %%mm0, (%%edi) \n\t"
2011
"subl $4, %%esi \n\t"
2012
"movq %%mm1, 8(%%edi) \n\t"
2013
"subl $16, %%edi \n\t"
2014
"subl $4, %%ecx \n\t"
2015
"jnz .loop1_pass2 \n\t"
2018
: "=c" (dummy_value_c), // output regs (dummy)
2019
"=S" (dummy_value_S),
2020
"=D" (dummy_value_D)
2022
: "1" (sptr), // esi // input regs
2024
"0" (width_mmx) // ecx
2026
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2027
: "%mm0", "%mm1" // clobber list
2034
for (i = width; i; i--)
2038
for (j = 0; j < png_pass_inc[pass]; j++)
2045
else if (width) /* && ((pass == 4) || (pass == 5)) */
2047
int width_mmx = ((width >> 3) << 3);
2048
width -= width_mmx; // 0-3 pixels => 0-3 bytes
2051
int dummy_value_c; // fix 'forbidden register spilled'
2055
__asm__ __volatile__ (
2056
"subl $7, %%esi \n\t"
2057
"subl $15, %%edi \n\t"
2059
".loop1_pass4: \n\t"
2060
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2061
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2062
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
2063
"punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
2064
"movq %%mm1, 8(%%edi) \n\t"
2065
"subl $8, %%esi \n\t"
2066
"movq %%mm0, (%%edi) \n\t"
2067
"subl $16, %%edi \n\t"
2068
"subl $8, %%ecx \n\t"
2069
"jnz .loop1_pass4 \n\t"
2072
: "=c" (dummy_value_c), // output regs (none)
2073
"=S" (dummy_value_S),
2074
"=D" (dummy_value_D)
2076
: "1" (sptr), // esi // input regs
2078
"0" (width_mmx) // ecx
2080
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2081
: "%mm0", "%mm1" // clobber list
2088
for (i = width; i; i--)
2092
for (j = 0; j < png_pass_inc[pass]; j++)
2099
} /* end of pixel_bytes == 1 */
2101
//--------------------------------------------------------------
2102
else if (pixel_bytes == 2)
2104
if (((pass == 0) || (pass == 1)) && width)
2106
int width_mmx = ((width >> 1) << 1);
2107
width -= width_mmx; // 0,1 pixels => 0,2 bytes
2110
int dummy_value_c; // fix 'forbidden register spilled'
2114
__asm__ __volatile__ (
2115
"subl $2, %%esi \n\t"
2116
"subl $30, %%edi \n\t"
2118
".loop2_pass0: \n\t"
2119
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2120
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2121
"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2122
"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2123
"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2124
"movq %%mm0, (%%edi) \n\t"
2125
"movq %%mm0, 8(%%edi) \n\t"
2126
"movq %%mm1, 16(%%edi) \n\t"
2127
"subl $4, %%esi \n\t"
2128
"movq %%mm1, 24(%%edi) \n\t"
2129
"subl $32, %%edi \n\t"
2130
"subl $2, %%ecx \n\t"
2131
"jnz .loop2_pass0 \n\t"
2134
: "=c" (dummy_value_c), // output regs (dummy)
2135
"=S" (dummy_value_S),
2136
"=D" (dummy_value_D)
2138
: "1" (sptr), // esi // input regs
2140
"0" (width_mmx) // ecx
2142
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2143
: "%mm0", "%mm1" // clobber list
2148
sptr -= (width_mmx*2 - 2); // sign fixed
2149
dp -= (width_mmx*16 - 2); // sign fixed
2150
for (i = width; i; i--)
2155
png_memcpy(v, sptr, 2);
2156
for (j = 0; j < png_pass_inc[pass]; j++)
2159
png_memcpy(dp, v, 2);
2163
else if (((pass == 2) || (pass == 3)) && width)
2165
int width_mmx = ((width >> 1) << 1) ;
2166
width -= width_mmx; // 0,1 pixels => 0,2 bytes
2169
int dummy_value_c; // fix 'forbidden register spilled'
2173
__asm__ __volatile__ (
2174
"subl $2, %%esi \n\t"
2175
"subl $14, %%edi \n\t"
2177
".loop2_pass2: \n\t"
2178
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2179
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2180
"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
2181
"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
2182
"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
2183
"movq %%mm0, (%%edi) \n\t"
2184
"subl $4, %%esi \n\t"
2185
"movq %%mm1, 8(%%edi) \n\t"
2186
"subl $16, %%edi \n\t"
2187
"subl $2, %%ecx \n\t"
2188
"jnz .loop2_pass2 \n\t"
2191
: "=c" (dummy_value_c), // output regs (dummy)
2192
"=S" (dummy_value_S),
2193
"=D" (dummy_value_D)
2195
: "1" (sptr), // esi // input regs
2197
"0" (width_mmx) // ecx
2199
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2200
: "%mm0", "%mm1" // clobber list
2205
sptr -= (width_mmx*2 - 2); // sign fixed
2206
dp -= (width_mmx*8 - 2); // sign fixed
2207
for (i = width; i; i--)
2212
png_memcpy(v, sptr, 2);
2213
for (j = 0; j < png_pass_inc[pass]; j++)
2216
png_memcpy(dp, v, 2);
2220
else if (width) // pass == 4 or 5
2222
int width_mmx = ((width >> 1) << 1) ;
2223
width -= width_mmx; // 0,1 pixels => 0,2 bytes
2226
int dummy_value_c; // fix 'forbidden register spilled'
2230
__asm__ __volatile__ (
2231
"subl $2, %%esi \n\t"
2232
"subl $6, %%edi \n\t"
2234
".loop2_pass4: \n\t"
2235
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2236
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2237
"subl $4, %%esi \n\t"
2238
"movq %%mm0, (%%edi) \n\t"
2239
"subl $8, %%edi \n\t"
2240
"subl $2, %%ecx \n\t"
2241
"jnz .loop2_pass4 \n\t"
2244
: "=c" (dummy_value_c), // output regs (dummy)
2245
"=S" (dummy_value_S),
2246
"=D" (dummy_value_D)
2248
: "1" (sptr), // esi // input regs
2250
"0" (width_mmx) // ecx
2252
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2253
: "%mm0" // clobber list
2258
sptr -= (width_mmx*2 - 2); // sign fixed
2259
dp -= (width_mmx*4 - 2); // sign fixed
2260
for (i = width; i; i--)
2265
png_memcpy(v, sptr, 2);
2266
for (j = 0; j < png_pass_inc[pass]; j++)
2269
png_memcpy(dp, v, 2);
2273
} /* end of pixel_bytes == 2 */
2275
//--------------------------------------------------------------
2276
else if (pixel_bytes == 4)
2278
if (((pass == 0) || (pass == 1)) && width)
2280
int width_mmx = ((width >> 1) << 1);
2281
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2284
int dummy_value_c; // fix 'forbidden register spilled'
2288
__asm__ __volatile__ (
2289
"subl $4, %%esi \n\t"
2290
"subl $60, %%edi \n\t"
2292
".loop4_pass0: \n\t"
2293
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2294
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2295
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2296
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2297
"movq %%mm0, (%%edi) \n\t"
2298
"movq %%mm0, 8(%%edi) \n\t"
2299
"movq %%mm0, 16(%%edi) \n\t"
2300
"movq %%mm0, 24(%%edi) \n\t"
2301
"movq %%mm1, 32(%%edi) \n\t"
2302
"movq %%mm1, 40(%%edi) \n\t"
2303
"movq %%mm1, 48(%%edi) \n\t"
2304
"subl $8, %%esi \n\t"
2305
"movq %%mm1, 56(%%edi) \n\t"
2306
"subl $64, %%edi \n\t"
2307
"subl $2, %%ecx \n\t"
2308
"jnz .loop4_pass0 \n\t"
2311
: "=c" (dummy_value_c), // output regs (dummy)
2312
"=S" (dummy_value_S),
2313
"=D" (dummy_value_D)
2315
: "1" (sptr), // esi // input regs
2317
"0" (width_mmx) // ecx
2319
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2320
: "%mm0", "%mm1" // clobber list
2325
sptr -= (width_mmx*4 - 4); // sign fixed
2326
dp -= (width_mmx*32 - 4); // sign fixed
2327
for (i = width; i; i--)
2332
png_memcpy(v, sptr, 4);
2333
for (j = 0; j < png_pass_inc[pass]; j++)
2336
png_memcpy(dp, v, 4);
2340
else if (((pass == 2) || (pass == 3)) && width)
2342
int width_mmx = ((width >> 1) << 1);
2343
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2346
int dummy_value_c; // fix 'forbidden register spilled'
2350
__asm__ __volatile__ (
2351
"subl $4, %%esi \n\t"
2352
"subl $28, %%edi \n\t"
2354
".loop4_pass2: \n\t"
2355
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2356
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2357
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2358
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2359
"movq %%mm0, (%%edi) \n\t"
2360
"movq %%mm0, 8(%%edi) \n\t"
2361
"movq %%mm1, 16(%%edi) \n\t"
2362
"movq %%mm1, 24(%%edi) \n\t"
2363
"subl $8, %%esi \n\t"
2364
"subl $32, %%edi \n\t"
2365
"subl $2, %%ecx \n\t"
2366
"jnz .loop4_pass2 \n\t"
2369
: "=c" (dummy_value_c), // output regs (dummy)
2370
"=S" (dummy_value_S),
2371
"=D" (dummy_value_D)
2373
: "1" (sptr), // esi // input regs
2375
"0" (width_mmx) // ecx
2377
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2378
: "%mm0", "%mm1" // clobber list
2383
sptr -= (width_mmx*4 - 4); // sign fixed
2384
dp -= (width_mmx*16 - 4); // sign fixed
2385
for (i = width; i; i--)
2390
png_memcpy(v, sptr, 4);
2391
for (j = 0; j < png_pass_inc[pass]; j++)
2394
png_memcpy(dp, v, 4);
2398
else if (width) // pass == 4 or 5
2400
int width_mmx = ((width >> 1) << 1) ;
2401
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2404
int dummy_value_c; // fix 'forbidden register spilled'
2408
__asm__ __volatile__ (
2409
"subl $4, %%esi \n\t"
2410
"subl $12, %%edi \n\t"
2412
".loop4_pass4: \n\t"
2413
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2414
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2415
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2416
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2417
"movq %%mm0, (%%edi) \n\t"
2418
"subl $8, %%esi \n\t"
2419
"movq %%mm1, 8(%%edi) \n\t"
2420
"subl $16, %%edi \n\t"
2421
"subl $2, %%ecx \n\t"
2422
"jnz .loop4_pass4 \n\t"
2425
: "=c" (dummy_value_c), // output regs (dummy)
2426
"=S" (dummy_value_S),
2427
"=D" (dummy_value_D)
2429
: "1" (sptr), // esi // input regs
2431
"0" (width_mmx) // ecx
2433
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2434
: "%mm0", "%mm1" // clobber list
2439
sptr -= (width_mmx*4 - 4); // sign fixed
2440
dp -= (width_mmx*8 - 4); // sign fixed
2441
for (i = width; i; i--)
2446
png_memcpy(v, sptr, 4);
2447
for (j = 0; j < png_pass_inc[pass]; j++)
2450
png_memcpy(dp, v, 4);
2454
} /* end of pixel_bytes == 4 */
2456
//--------------------------------------------------------------
2457
else if (pixel_bytes == 8)
2459
// GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2460
// GRR NOTE: no need to combine passes here!
2461
if (((pass == 0) || (pass == 1)) && width)
2463
int dummy_value_c; // fix 'forbidden register spilled'
2467
// source is 8-byte RRGGBBAA
2468
// dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2469
__asm__ __volatile__ (
2470
"subl $56, %%edi \n\t" // start of last block
2472
".loop8_pass0: \n\t"
2473
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2474
"movq %%mm0, (%%edi) \n\t"
2475
"movq %%mm0, 8(%%edi) \n\t"
2476
"movq %%mm0, 16(%%edi) \n\t"
2477
"movq %%mm0, 24(%%edi) \n\t"
2478
"movq %%mm0, 32(%%edi) \n\t"
2479
"movq %%mm0, 40(%%edi) \n\t"
2480
"movq %%mm0, 48(%%edi) \n\t"
2481
"subl $8, %%esi \n\t"
2482
"movq %%mm0, 56(%%edi) \n\t"
2483
"subl $64, %%edi \n\t"
2485
"jnz .loop8_pass0 \n\t"
2488
: "=c" (dummy_value_c), // output regs (dummy)
2489
"=S" (dummy_value_S),
2490
"=D" (dummy_value_D)
2492
: "1" (sptr), // esi // input regs
2496
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2497
: "%mm0" // clobber list
2501
else if (((pass == 2) || (pass == 3)) && width)
2503
// source is 8-byte RRGGBBAA
2504
// dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2505
// (recall that expansion is _in place_: sptr and dp
2506
// both point at locations within same row buffer)
2508
int dummy_value_c; // fix 'forbidden register spilled'
2512
__asm__ __volatile__ (
2513
"subl $24, %%edi \n\t" // start of last block
2515
".loop8_pass2: \n\t"
2516
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2517
"movq %%mm0, (%%edi) \n\t"
2518
"movq %%mm0, 8(%%edi) \n\t"
2519
"movq %%mm0, 16(%%edi) \n\t"
2520
"subl $8, %%esi \n\t"
2521
"movq %%mm0, 24(%%edi) \n\t"
2522
"subl $32, %%edi \n\t"
2524
"jnz .loop8_pass2 \n\t"
2527
: "=c" (dummy_value_c), // output regs (dummy)
2528
"=S" (dummy_value_S),
2529
"=D" (dummy_value_D)
2531
: "1" (sptr), // esi // input regs
2535
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2536
: "%mm0" // clobber list
2541
else if (width) // pass == 4 or 5
2543
// source is 8-byte RRGGBBAA
2544
// dest is 16-byte RRGGBBAA RRGGBBAA
2546
int dummy_value_c; // fix 'forbidden register spilled'
2550
__asm__ __volatile__ (
2551
"subl $8, %%edi \n\t" // start of last block
2553
".loop8_pass4: \n\t"
2554
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2555
"movq %%mm0, (%%edi) \n\t"
2556
"subl $8, %%esi \n\t"
2557
"movq %%mm0, 8(%%edi) \n\t"
2558
"subl $16, %%edi \n\t"
2560
"jnz .loop8_pass4 \n\t"
2563
: "=c" (dummy_value_c), // output regs (dummy)
2564
"=S" (dummy_value_S),
2565
"=D" (dummy_value_D)
2567
: "1" (sptr), // esi // input regs
2571
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2572
: "%mm0" // clobber list
2578
} /* end of pixel_bytes == 8 */
2580
//--------------------------------------------------------------
2581
else if (pixel_bytes == 6)
2583
for (i = width; i; i--)
2587
png_memcpy(v, sptr, 6);
2588
for (j = 0; j < png_pass_inc[pass]; j++)
2590
png_memcpy(dp, v, 6);
2595
} /* end of pixel_bytes == 6 */
2597
//--------------------------------------------------------------
2600
for (i = width; i; i--)
2604
png_memcpy(v, sptr, pixel_bytes);
2605
for (j = 0; j < png_pass_inc[pass]; j++)
2607
png_memcpy(dp, v, pixel_bytes);
2613
} // end of _mmx_supported ========================================
2615
else /* MMX not supported: use modified C code - takes advantage
2616
* of inlining of png_memcpy for a constant */
2617
/* GRR 19991007: does it? or should pixel_bytes in each
2618
* block be replaced with immediate value (e.g., 1)? */
2619
/* GRR 19991017: replaced with constants in each case */
2620
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2622
if (pixel_bytes == 1)
2624
for (i = width; i; i--)
2627
for (j = 0; j < png_pass_inc[pass]; j++)
2634
else if (pixel_bytes == 3)
2636
for (i = width; i; i--)
2640
png_memcpy(v, sptr, 3);
2641
for (j = 0; j < png_pass_inc[pass]; j++)
2643
png_memcpy(dp, v, 3);
2649
else if (pixel_bytes == 2)
2651
for (i = width; i; i--)
2655
png_memcpy(v, sptr, 2);
2656
for (j = 0; j < png_pass_inc[pass]; j++)
2658
png_memcpy(dp, v, 2);
2664
else if (pixel_bytes == 4)
2666
for (i = width; i; i--)
2670
png_memcpy(v, sptr, 4);
2671
for (j = 0; j < png_pass_inc[pass]; j++)
2674
if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2676
printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2677
row, dp, row+png_ptr->row_buf_size);
2678
printf("row_buf=%d\n",png_ptr->row_buf_size);
2681
png_memcpy(dp, v, 4);
2687
else if (pixel_bytes == 6)
2689
for (i = width; i; i--)
2693
png_memcpy(v, sptr, 6);
2694
for (j = 0; j < png_pass_inc[pass]; j++)
2696
png_memcpy(dp, v, 6);
2702
else if (pixel_bytes == 8)
2704
for (i = width; i; i--)
2708
png_memcpy(v, sptr, 8);
2709
for (j = 0; j < png_pass_inc[pass]; j++)
2711
png_memcpy(dp, v, 8);
2717
else /* GRR: should never be reached */
2719
for (i = width; i; i--)
2723
png_memcpy(v, sptr, pixel_bytes);
2724
for (j = 0; j < png_pass_inc[pass]; j++)
2726
png_memcpy(dp, v, pixel_bytes);
2729
sptr -= pixel_bytes;
2733
} /* end if (MMX not supported) */
2736
} /* end switch (row_info->pixel_depth) */
2738
row_info->width = final_width;
2740
row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
2743
} /* end png_do_read_interlace() */
2745
#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2746
#endif /* PNG_READ_INTERLACING_SUPPORTED */
2750
#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2751
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2753
// These variables are utilized in the functions below. They are declared
2754
// globally here to ensure alignment on 8-byte boundaries.
//
// _LBCarryMask has 0x01 in every byte; the filter code ANDs with it to
// isolate the low bit of each byte (the carry lost when halving).
// _HBClearMask has 0x7f in every byte; the filter code ANDs with it after a
// 64-bit "psrlq $1" to clear bit 7 of each byte, which would otherwise hold
// the bit shifted in from the neighbouring byte.
// The remaining variables are written by the filter routines before use
// (per-bpp byte-group masks and shift counts).
2759
} _LBCarryMask = {0x0101010101010101LL},
2760
_HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2761
_ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2763
#ifdef PNG_THREAD_UNSAFE_OK
2764
//===========================================================================//
2766
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2768
//===========================================================================//
2770
// Optimized code for PNG Average filter decoder
2772
/* Undo the PNG "Average" filter on one row, in place, using MMX for the
 * bulk of the row.
 *
 * row_info  supplies pixel_depth (used to derive bpp, rounded up to whole
 *           bytes) and rowbytes (stored in _FullLength).
 * row       filtered bytes on entry, reconstructed Raw bytes on exit
 *           (preloaded into edi for the asm blocks, per the "pre" notes).
 * prev_row  the previous row, Prior(x) (preloaded into esi).
 *
 * Stages visible below:
 *   1. Scalar prologue: first bpp bytes use Raw(x) = Avg(x) + Prior(x)/2,
 *      then bytes up to the next 8-byte boundary use the full formula;
 *      the offset where MMX work must stop is stored in _MMXLength.
 *   2. A switch on bpp picks an MMX loop specialised for the pixel size.
 *   3. Scalar epilogue: finishes bytes from _MMXLength up to _FullLength
 *      and issues EMMS.
 *
 * The MMX loops average bytes without widening: each operand is halved
 * with a 64-bit shift (bit 7 of every byte then masked off), and 1 is added
 * back wherever both operands had their low bit set.
 *
 * The asm refers to _dif, _FullLength, _MMXLength and the mask/shift
 * variables by symbol name, so they are shared state (presumably file-scope;
 * the definition sits under PNG_THREAD_UNSAFE_OK) -- not reentrant.
 *
 * NOTE(review): asm labels, braces, case lines and the "incl %%ebx" steps
 * are not present in this text, and bare numeric lines are interleaved;
 * confirm against the upstream file before relying on it.
 */
static void /* PRIVATE */
2773
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2777
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2781
bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel (round up)
2782
_FullLength = row_info->rowbytes; // # of bytes to filter
2784
// ---- Stage 1: scalar prologue (first bpp bytes, then up to alignment) ----
__asm__ __volatile__ (
2785
// initialize address pointers and offset
2787
"pushl %%ebx \n\t" // save index to Global Offset Table
2789
//pre "movl row, %%edi \n\t" // edi: Avg(x)
2790
"xorl %%ebx, %%ebx \n\t" // ebx: x
2791
"movl %%edi, %%edx \n\t"
2792
//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2793
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2794
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2796
"xorl %%eax,%%eax \n\t"
2798
// Compute the Raw value for the first bpp bytes
2799
// Raw(x) = Avg(x) + (Prior(x)/2)
2801
"movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2803
"shrb %%al \n\t" // divide by 2
2804
"addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2805
//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2806
"cmpl %%ecx, %%ebx \n\t"
2807
"movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2808
"jb avg_rlp \n\t" // mov does not affect flags
2810
// get # of bytes to alignment
2811
"movl %%edi, _dif \n\t" // take start of row
2812
"addl %%ebx, _dif \n\t" // add bpp
2813
"addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2814
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2815
"subl %%edi, _dif \n\t" // subtract from start => value ebx at
2816
"jz avg_go \n\t" // alignment
2819
// Compute the Raw value for the bytes up to the alignment boundary
2820
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2821
"xorl %%ecx, %%ecx \n\t"
2824
"xorl %%eax, %%eax \n\t"
2825
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2826
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2827
"addw %%cx, %%ax \n\t" // 16-bit add keeps the carry out of bit 7
2829
"shrw %%ax \n\t" // divide by 2
2830
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2831
"cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2832
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2833
"jb avg_lp1 \n\t" // repeat until at alignment boundary
2836
// _MMXLength = _FullLength - ((_FullLength - x) & 7)
"movl _FullLength, %%eax \n\t"
2837
"movl %%eax, %%ecx \n\t"
2838
"subl %%ebx, %%eax \n\t" // subtract alignment fix
2839
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2840
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
2841
"movl %%ecx, _MMXLength \n\t"
2843
"popl %%ebx \n\t" // restore index to Global Offset Table
2846
: "=c" (dummy_value_c), // output regs (dummy)
2847
"=S" (dummy_value_S),
2848
"=D" (dummy_value_D)
2850
: "0" (bpp), // ecx // input regs
2851
"1" (prev_row), // esi
2854
: "%eax", "%edx" // clobber list
2858
// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2859
// (seems to work fine without...)
2862
// ---- Stage 2: MMX loops, one variant per bpp ----
// now do the math for the rest of the row
2867
// 3 bytes per pixel: the active-group mask covers 3 bytes, shifts are 24 bits
_ActiveMask.use = 0x0000000000ffffffLL;
2868
_ShiftBpp.use = 24; // == 3 * 8
2869
_ShiftRem.use = 40; // == 64 - 24
2871
__asm__ __volatile__ (
2872
// re-init address pointers and offset
2873
"movq _ActiveMask, %%mm7 \n\t"
2874
"movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
2875
"movq _LBCarryMask, %%mm5 \n\t"
2876
// preload "movl row, %%edi \n\t" // edi: Avg(x)
2877
"movq _HBClearMask, %%mm4 \n\t"
2878
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2880
// prime the pump: load the first Raw(x-bpp) data set
2881
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2882
// (correct pos. in loop below)
2884
"movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2885
"movq %%mm5, %%mm3 \n\t"
2886
"psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
2888
"movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2889
"movq %%mm7, %%mm6 \n\t"
2890
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2891
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2892
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2894
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2896
// add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2897
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2899
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2901
// lsb's were == 1 (only valid for active group)
2902
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2903
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2905
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2907
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
2908
// bytes to add to Avg
2909
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2910
// Avg for each Active
2912
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2913
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover group 2
2915
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2916
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2917
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2919
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2921
// lsb's were == 1 (only valid for active group)
2922
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2923
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2925
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2927
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
2928
// bytes to add to Avg
2929
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2930
// Avg for each Active
2933
// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2934
"psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last group
2937
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2938
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2939
// Data only needs to be shifted once here to
2940
// get the correct x-bpp offset.
2941
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2943
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2945
// lsb's were == 1 (only valid for active group)
2946
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2947
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2949
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2951
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
2952
// bytes to add to Avg
2953
"addl $8, %%ecx \n\t"
2954
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
2955
// Avg for each Active
2957
// now ready to write back to memory
2958
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2959
// move updated Raw(x) to use as Raw(x-bpp) for next loop
2960
"cmpl _MMXLength, %%ecx \n\t"
2961
"movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2964
: "=S" (dummy_value_S), // output regs (dummy)
2965
"=D" (dummy_value_D)
2967
: "0" (prev_row), // esi // input regs
2970
: "%ecx" // clobber list
2971
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2972
, "%mm0", "%mm1", "%mm2", "%mm3"
2973
, "%mm4", "%mm5", "%mm6", "%mm7"
2981
//case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2982
//case 5: // GRR BOGUS
2984
// Generic two-group variant (see "end 4,6 bpp" below): masks and shifts
// are derived from bpp at run time instead of being hard-coded.
_ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2985
// appropriate inactive bytes
2986
_ShiftBpp.use = bpp << 3;
2987
_ShiftRem.use = 64 - _ShiftBpp.use;
2989
__asm__ __volatile__ (
2990
"movq _HBClearMask, %%mm4 \n\t"
2992
// re-init address pointers and offset
2993
"movl _dif, %%ecx \n\t" // ecx: x = offset to
2994
// alignment boundary
2996
// load _ActiveMask and clear all bytes except for 1st active group
2997
"movq _ActiveMask, %%mm7 \n\t"
2998
// preload "movl row, %%edi \n\t" // edi: Avg(x)
2999
"psrlq _ShiftRem, %%mm7 \n\t"
3000
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3001
"movq %%mm7, %%mm6 \n\t"
3002
"movq _LBCarryMask, %%mm5 \n\t"
3003
"psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active group
3006
// prime the pump: load the first Raw(x-bpp) data set
3007
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3008
// (we correct pos. in loop below)
3010
"movq (%%edi,%%ecx,), %%mm0 \n\t"
3011
"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3012
"movq (%%esi,%%ecx,), %%mm1 \n\t"
3013
// add (Prev_row/2) to average
3014
"movq %%mm5, %%mm3 \n\t"
3015
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3016
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3017
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
3019
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
3021
// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3022
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
3024
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
3026
// lsb's were == 1 (only valid for active group)
3027
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3028
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3030
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3032
"pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
3033
// bytes to add to Avg
3034
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3037
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3038
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3039
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3040
"addl $8, %%ecx \n\t"
3041
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
3043
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
3045
// lsb's were == 1 (only valid for active group)
3046
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3047
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3049
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3051
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3052
// bytes to add to Avg
3053
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3054
// Avg for each Active
3056
"cmpl _MMXLength, %%ecx \n\t"
3057
// now ready to write back to memory
3058
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3059
// prep Raw(x-bpp) for next loop
3060
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3063
: "=S" (dummy_value_S), // output regs (dummy)
3064
"=D" (dummy_value_D)
3066
: "0" (prev_row), // esi // input regs
3069
: "%ecx" // clobber list
3070
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3071
, "%mm0", "%mm1", "%mm2", "%mm3"
3072
, "%mm4", "%mm5", "%mm6", "%mm7"
3076
break; // end 4,6 bpp
3080
// 2 bytes per pixel: four 2-byte active groups per 8-byte quadword
_ActiveMask.use = 0x000000000000ffffLL;
3081
_ShiftBpp.use = 16; // == 2 * 8
3082
_ShiftRem.use = 48; // == 64 - 16
3084
__asm__ __volatile__ (
3086
"movq _ActiveMask, %%mm7 \n\t"
3087
// re-init address pointers and offset
3088
"movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
3090
"movq _LBCarryMask, %%mm5 \n\t"
3091
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3092
"movq _HBClearMask, %%mm4 \n\t"
3093
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3095
// prime the pump: load the first Raw(x-bpp) data set
3096
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3097
// (we correct pos. in loop below)
3099
"movq (%%edi,%%ecx,), %%mm0 \n\t"
3100
"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
3101
"movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
3102
// add (Prev_row/2) to average
3103
"movq %%mm5, %%mm3 \n\t"
3104
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3105
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3106
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
3108
"movq %%mm7, %%mm6 \n\t"
3109
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
3112
// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3113
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
3115
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
3117
// lsb's were == 1 (only valid
3118
// for active group)
3119
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3120
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3122
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3124
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
3125
// bytes to add to Avg
3126
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
3127
// for each Active byte
3129
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3130
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover group 2
3132
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3133
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3134
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
3136
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
3138
// lsb's were == 1 (only valid
3139
// for active group)
3140
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3141
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3143
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3145
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
3146
// bytes to add to Avg
3147
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3148
// Avg for each Active byte
3150
// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3151
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover group 3
3153
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3154
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3155
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
3157
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
3158
// where both lsb's were == 1
3159
// (only valid for active group)
3160
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3161
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3163
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3165
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
3166
// bytes to add to Avg
3167
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3168
// Avg for each Active byte
3170
// add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3171
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover group 4
3173
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3174
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
3175
"addl $8, %%ecx \n\t"
3176
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
3178
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
3180
// lsb's were == 1 (only valid
3181
// for active group)
3182
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3183
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3185
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3187
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
3188
// bytes to add to Avg
3189
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
3190
// Avg for each Active byte
3192
"cmpl _MMXLength, %%ecx \n\t"
3193
// now ready to write back to memory
3194
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3195
// prep Raw(x-bpp) for next loop
3196
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
3199
: "=S" (dummy_value_S), // output regs (dummy)
3200
"=D" (dummy_value_D)
3202
: "0" (prev_row), // esi // input regs
3205
: "%ecx" // clobber list
3206
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3207
, "%mm0", "%mm1", "%mm2", "%mm3"
3208
, "%mm4", "%mm5", "%mm6", "%mm7"
3216
// 1 byte per pixel (see "end 1 bpp" below): no MMX here; a scalar loop
// runs from _dif to _FullLength and the function returns directly.
__asm__ __volatile__ (
3217
// re-init address pointers and offset
3219
"pushl %%ebx \n\t" // save Global Offset Table index
3221
"movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
3223
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3224
"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3226
// do Avg decode for remaining bytes
3227
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3228
"movl %%edi, %%edx \n\t"
3229
// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3230
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3231
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3234
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3235
"xorl %%eax, %%eax \n\t"
3236
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3237
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3238
"addw %%cx, %%ax \n\t"
3240
"shrw %%ax \n\t" // divide by 2
3241
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3243
"cmpl _FullLength, %%ebx \n\t" // check if at end of array
3244
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3245
// mov does not affect flags; -1 to offset inc ebx
3250
"popl %%ebx \n\t" // restore Global Offset Table index
3253
: "=c" (dummy_value_c), // output regs (dummy)
3254
"=S" (dummy_value_S),
3255
"=D" (dummy_value_D)
3257
: "0" (bpp), // ecx // input regs
3258
"1" (prev_row), // esi
3261
: "%eax", "%edx" // clobber list
3267
return; // end 1 bpp
3271
// Whole-quadword variant: Raw(x-bpp) is used unshifted and unmasked, so the
// previous 8 output bytes serve directly as Raw(x-bpp).
// NOTE(review): presumably the bpp == 8 case -- the case label is not
// present in this text; confirm.
__asm__ __volatile__ (
3272
// re-init address pointers and offset
3273
"movl _dif, %%ecx \n\t" // ecx: x == offset to alignment boundary
3274
"movq _LBCarryMask, %%mm5 \n\t"
3275
// preload "movl row, %%edi \n\t" // edi: Avg(x)
3276
"movq _HBClearMask, %%mm4 \n\t"
3277
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
3279
// prime the pump: load the first Raw(x-bpp) data set
3280
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3281
// (NO NEED to correct pos. in loop below)
3284
"movq (%%edi,%%ecx,), %%mm0 \n\t"
3285
"movq %%mm5, %%mm3 \n\t"
3286
"movq (%%esi,%%ecx,), %%mm1 \n\t"
3287
"addl $8, %%ecx \n\t"
3288
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3289
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3290
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3291
// where both lsb's were == 1
3292
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3293
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3294
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3295
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3296
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each byte
3297
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
3298
"cmpl _MMXLength, %%ecx \n\t"
3299
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3300
"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3303
: "=S" (dummy_value_S), // output regs (dummy)
3304
"=D" (dummy_value_D)
3306
: "0" (prev_row), // esi // input regs
3309
: "%ecx" // clobber list
3310
#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3311
, "%mm0", "%mm1", "%mm2"
3312
, "%mm3", "%mm4", "%mm5"
3318
default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3322
// GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3324
"Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3328
// The asm below is closed off by the "#endif /* 0 - NEVER REACHED */" that
// follows it; it still names row/prev_row/bpp directly and carries
// FIXASM/CHECKASM markers, i.e. it was never finished for GNU as.
__asm__ __volatile__ (
3329
"movq _LBCarryMask, %%mm5 \n\t"
3330
// re-init address pointers and offset
3331
"movl _dif, %%ebx \n\t" // ebx: x = offset to
3332
// alignment boundary
3333
"movl row, %%edi \n\t" // edi: Avg(x)
3334
"movq _HBClearMask, %%mm4 \n\t"
3335
"movl %%edi, %%edx \n\t"
3336
"movl prev_row, %%esi \n\t" // esi: Prior(x)
3337
"subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3339
"movq (%%edi,%%ebx,), %%mm0 \n\t"
3340
"movq %%mm5, %%mm3 \n\t"
3341
"movq (%%esi,%%ebx,), %%mm1 \n\t"
3342
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3343
"movq (%%edx,%%ebx,), %%mm2 \n\t"
3344
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3345
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3346
// where both lsb's were == 1
3347
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3348
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
3350
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each byte
3352
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3354
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
3356
"addl $8, %%ebx \n\t"
3357
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
3359
"cmpl _MMXLength, %%ebx \n\t"
3360
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3363
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3365
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3367
: "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3369
#endif /* 0 - NEVER REACHED */
3373
} // end switch (bpp)
3375
// ---- Stage 3: scalar epilogue for the bytes past _MMXLength ----
__asm__ __volatile__ (
3376
// MMX acceleration complete; now do clean-up
3377
// check if any remaining bytes left to decode
3379
"pushl %%ebx \n\t" // save index to Global Offset Table
3381
"movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3382
//pre "movl row, %%edi \n\t" // edi: Avg(x)
3383
"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3386
// do Avg decode for remaining bytes
3387
//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3388
"movl %%edi, %%edx \n\t"
3389
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3390
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3391
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3394
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3395
"xorl %%eax, %%eax \n\t"
3396
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3397
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3398
"addw %%cx, %%ax \n\t"
3400
"shrw %%ax \n\t" // divide by 2
3401
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3402
"cmpl _FullLength, %%ebx \n\t" // check if at end of array
3403
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3404
"jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3407
"EMMS \n\t" // end MMX; prep for poss. FP instrs.
3409
"popl %%ebx \n\t" // restore index to Global Offset Table
3412
: "=c" (dummy_value_c), // output regs (dummy)
3413
"=S" (dummy_value_S),
3414
"=D" (dummy_value_D)
3416
: "0" (bpp), // ecx // input regs
3417
"1" (prev_row), // esi
3420
: "%eax", "%edx" // clobber list
3426
} /* end png_read_filter_row_mmx_avg() */
3431
#ifdef PNG_THREAD_UNSAFE_OK
3432
//===========================================================================//
3434
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3436
//===========================================================================//
3438
// Optimized code for PNG Paeth filter decoder
3440
static void /* PRIVATE */
3441
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3445
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3449
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3450
_FullLength = row_info->rowbytes; // # of bytes to filter
3452
__asm__ __volatile__ (
3454
"pushl %%ebx \n\t" // save index to Global Offset Table
3456
"xorl %%ebx, %%ebx \n\t" // ebx: x offset
3457
//pre "movl row, %%edi \n\t"
3458
"xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3459
//pre "movl prev_row, %%esi \n\t"
3460
"xorl %%eax, %%eax \n\t"
3462
// Compute the Raw value for the first bpp bytes
3463
// Note: the formula works out to be always
3464
// Raw(x) = Paeth(x) + Prior(x) where x < bpp
3466
"movb (%%edi,%%ebx,), %%al \n\t"
3467
"addb (%%esi,%%ebx,), %%al \n\t"
3469
//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3470
"cmpl %%ecx, %%ebx \n\t"
3471
"movb %%al, -1(%%edi,%%ebx,) \n\t"
3473
// get # of bytes to alignment
3474
"movl %%edi, _dif \n\t" // take start of row
3475
"addl %%ebx, _dif \n\t" // add bpp
3476
"xorl %%ecx, %%ecx \n\t"
3477
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
3479
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3480
"subl %%edi, _dif \n\t" // subtract from start ==> value ebx
3486
"xorl %%eax, %%eax \n\t"
3487
// pav = p - a = (a + b - c) - a = b - c
3488
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3489
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3490
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3491
"movl %%eax, _patemp \n\t" // Save pav for later use
3492
"xorl %%eax, %%eax \n\t"
3493
// pbv = p - b = (a + b - c) - b = a - c
3494
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3495
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3496
"movl %%eax, %%ecx \n\t"
3497
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3498
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
3500
"testl $0x80000000, %%eax \n\t"
3502
"negl %%eax \n\t" // reverse sign of neg values
3505
"movl %%eax, _pctemp \n\t" // save pc for later use
3507
"testl $0x80000000, %%ecx \n\t"
3509
"negl %%ecx \n\t" // reverse sign of neg values
3512
"movl %%ecx, _pbtemp \n\t" // save pb for later use
3514
"movl _patemp, %%eax \n\t"
3515
"testl $0x80000000, %%eax \n\t"
3517
"negl %%eax \n\t" // reverse sign of neg values
3520
"movl %%eax, _patemp \n\t" // save pa for later use
3522
"cmpl %%ecx, %%eax \n\t"
3523
"jna paeth_abb \n\t"
3524
// pa > pb; now test if pb <= pc
3525
"cmpl _pctemp, %%ecx \n\t"
3526
"jna paeth_bbc \n\t"
3527
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3528
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3529
"jmp paeth_paeth \n\t"
3532
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3533
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3534
"jmp paeth_paeth \n\t"
3537
// pa <= pb; now test if pa <= pc
3538
"cmpl _pctemp, %%eax \n\t"
3539
"jna paeth_abc \n\t"
3540
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3541
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3542
"jmp paeth_paeth \n\t"
3545
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3546
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3551
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3552
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
3553
"cmpl _dif, %%ebx \n\t"
3557
"movl _FullLength, %%ecx \n\t"
3558
"movl %%ecx, %%eax \n\t"
3559
"subl %%ebx, %%eax \n\t" // subtract alignment fix
3560
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3561
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
3562
"movl %%ecx, _MMXLength \n\t"
3564
"popl %%ebx \n\t" // restore index to Global Offset Table
3567
: "=c" (dummy_value_c), // output regs (dummy)
3568
"=S" (dummy_value_S),
3569
"=D" (dummy_value_D)
3571
: "0" (bpp), // ecx // input regs
3572
"1" (prev_row), // esi
3575
: "%eax", "%edx" // clobber list
3581
// now do the math for the rest of the row
3586
_ActiveMask.use = 0x0000000000ffffffLL;
3587
_ActiveMaskEnd.use = 0xffff000000000000LL;
3588
_ShiftBpp.use = 24; // == bpp(3) * 8
3589
_ShiftRem.use = 40; // == 64 - 24
3591
__asm__ __volatile__ (
3592
"movl _dif, %%ecx \n\t"
3593
// preload "movl row, %%edi \n\t"
3594
// preload "movl prev_row, %%esi \n\t"
3595
"pxor %%mm0, %%mm0 \n\t"
3596
// prime the pump: load the first Raw(x-bpp) data set
3597
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3599
"psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
3601
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3602
"punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3603
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3604
"punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3605
"psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
3607
// pav = p - a = (a + b - c) - a = b - c
3608
"movq %%mm2, %%mm4 \n\t"
3609
"punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3610
// pbv = p - b = (a + b - c) - b = a - c
3611
"movq %%mm1, %%mm5 \n\t"
3612
"psubw %%mm3, %%mm4 \n\t"
3613
"pxor %%mm7, %%mm7 \n\t"
3614
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3615
"movq %%mm4, %%mm6 \n\t"
3616
"psubw %%mm3, %%mm5 \n\t"
3618
// pa = abs(p-a) = abs(pav)
3619
// pb = abs(p-b) = abs(pbv)
3620
// pc = abs(p-c) = abs(pcv)
3621
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3622
"paddw %%mm5, %%mm6 \n\t"
3623
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3624
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3625
"psubw %%mm0, %%mm4 \n\t"
3626
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3627
"psubw %%mm0, %%mm4 \n\t"
3628
"psubw %%mm7, %%mm5 \n\t"
3629
"pxor %%mm0, %%mm0 \n\t"
3630
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3631
"pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3632
"psubw %%mm7, %%mm5 \n\t"
3633
"psubw %%mm0, %%mm6 \n\t"
3635
"movq %%mm4, %%mm7 \n\t"
3636
"psubw %%mm0, %%mm6 \n\t"
3637
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3638
"movq %%mm7, %%mm0 \n\t"
3639
// use mm7 mask to merge pa & pb
3640
"pand %%mm7, %%mm5 \n\t"
3641
// use mm0 mask copy to merge a & b
3642
"pand %%mm0, %%mm2 \n\t"
3643
"pandn %%mm4, %%mm7 \n\t"
3644
"pandn %%mm1, %%mm0 \n\t"
3645
"paddw %%mm5, %%mm7 \n\t"
3646
"paddw %%mm2, %%mm0 \n\t"
3647
// test ((pa <= pb)? pa:pb) <= pc
3648
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3649
"pxor %%mm1, %%mm1 \n\t"
3650
"pand %%mm7, %%mm3 \n\t"
3651
"pandn %%mm0, %%mm7 \n\t"
3652
"paddw %%mm3, %%mm7 \n\t"
3653
"pxor %%mm0, %%mm0 \n\t"
3654
"packuswb %%mm1, %%mm7 \n\t"
3655
"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3656
"pand _ActiveMask, %%mm7 \n\t"
3657
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3658
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3659
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3660
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3661
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
3663
// now do Paeth for 2nd set of bytes (3-5)
3664
"psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3665
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3666
"pxor %%mm7, %%mm7 \n\t"
3667
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3668
// pbv = p - b = (a + b - c) - b = a - c
3669
"movq %%mm1, %%mm5 \n\t"
3670
// pav = p - a = (a + b - c) - a = b - c
3671
"movq %%mm2, %%mm4 \n\t"
3672
"psubw %%mm3, %%mm5 \n\t"
3673
"psubw %%mm3, %%mm4 \n\t"
3674
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3675
// pav + pbv = pbv + pav
3676
"movq %%mm5, %%mm6 \n\t"
3677
"paddw %%mm4, %%mm6 \n\t"
3679
// pa = abs(p-a) = abs(pav)
3680
// pb = abs(p-b) = abs(pbv)
3681
// pc = abs(p-c) = abs(pcv)
3682
"pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3683
"pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3684
"pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3685
"pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3686
"psubw %%mm0, %%mm5 \n\t"
3687
"psubw %%mm7, %%mm4 \n\t"
3688
"psubw %%mm0, %%mm5 \n\t"
3689
"psubw %%mm7, %%mm4 \n\t"
3690
"pxor %%mm0, %%mm0 \n\t"
3691
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3692
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3693
"psubw %%mm0, %%mm6 \n\t"
3695
"movq %%mm4, %%mm7 \n\t"
3696
"psubw %%mm0, %%mm6 \n\t"
3697
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3698
"movq %%mm7, %%mm0 \n\t"
3699
// use mm7 mask to merge pa & pb
3700
"pand %%mm7, %%mm5 \n\t"
3701
// use mm0 mask copy to merge a & b
3702
"pand %%mm0, %%mm2 \n\t"
3703
"pandn %%mm4, %%mm7 \n\t"
3704
"pandn %%mm1, %%mm0 \n\t"
3705
"paddw %%mm5, %%mm7 \n\t"
3706
"paddw %%mm2, %%mm0 \n\t"
3707
// test ((pa <= pb)? pa:pb) <= pc
3708
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3709
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3710
"pand %%mm7, %%mm3 \n\t"
3711
"pandn %%mm0, %%mm7 \n\t"
3712
"pxor %%mm1, %%mm1 \n\t"
3713
"paddw %%mm3, %%mm7 \n\t"
3714
"pxor %%mm0, %%mm0 \n\t"
3715
"packuswb %%mm1, %%mm7 \n\t"
3716
"movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3717
"pand _ActiveMask, %%mm7 \n\t"
3718
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3719
"psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of 3 bytes
3721
// pav = p - a = (a + b - c) - a = b - c
3722
"movq %%mm2, %%mm4 \n\t"
3723
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3724
"psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3725
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3726
"movq %%mm7, %%mm1 \n\t"
3727
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3728
"psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3729
// now mm1 will be used as Raw(x-bpp)
3730
// now do Paeth for 3rd, and final, set of bytes (6-7)
3731
"pxor %%mm7, %%mm7 \n\t"
3732
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3733
"psubw %%mm3, %%mm4 \n\t"
3734
// pbv = p - b = (a + b - c) - b = a - c
3735
"movq %%mm1, %%mm5 \n\t"
3736
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3737
"movq %%mm4, %%mm6 \n\t"
3738
"psubw %%mm3, %%mm5 \n\t"
3739
"pxor %%mm0, %%mm0 \n\t"
3740
"paddw %%mm5, %%mm6 \n\t"
3742
// pa = abs(p-a) = abs(pav)
3743
// pb = abs(p-b) = abs(pbv)
3744
// pc = abs(p-c) = abs(pcv)
3745
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3746
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3747
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3748
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3749
"psubw %%mm0, %%mm4 \n\t"
3750
"psubw %%mm7, %%mm5 \n\t"
3751
"psubw %%mm0, %%mm4 \n\t"
3752
"psubw %%mm7, %%mm5 \n\t"
3753
"pxor %%mm0, %%mm0 \n\t"
3754
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3755
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3756
"psubw %%mm0, %%mm6 \n\t"
3758
"movq %%mm4, %%mm7 \n\t"
3759
"psubw %%mm0, %%mm6 \n\t"
3760
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3761
"movq %%mm7, %%mm0 \n\t"
3762
// use mm0 mask copy to merge a & b
3763
"pand %%mm0, %%mm2 \n\t"
3764
// use mm7 mask to merge pa & pb
3765
"pand %%mm7, %%mm5 \n\t"
3766
"pandn %%mm1, %%mm0 \n\t"
3767
"pandn %%mm4, %%mm7 \n\t"
3768
"paddw %%mm2, %%mm0 \n\t"
3769
"paddw %%mm5, %%mm7 \n\t"
3770
// test ((pa <= pb)? pa:pb) <= pc
3771
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3772
"pand %%mm7, %%mm3 \n\t"
3773
"pandn %%mm0, %%mm7 \n\t"
3774
"paddw %%mm3, %%mm7 \n\t"
3775
"pxor %%mm1, %%mm1 \n\t"
3776
"packuswb %%mm7, %%mm1 \n\t"
3777
// step ecx to next set of 8 bytes and repeat loop til done
3778
"addl $8, %%ecx \n\t"
3779
"pand _ActiveMaskEnd, %%mm1 \n\t"
3780
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3783
"cmpl _MMXLength, %%ecx \n\t"
3784
"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3785
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3786
// mm1 will be used as Raw(x-bpp) next loop
3787
// mm3 ready to be used as Prior(x-bpp) next loop
3790
: "=S" (dummy_value_S), // output regs (dummy)
3791
"=D" (dummy_value_D)
3793
: "0" (prev_row), // esi // input regs
3796
: "%ecx" // clobber list
3797
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3798
, "%mm0", "%mm1", "%mm2", "%mm3"
3799
, "%mm4", "%mm5", "%mm6", "%mm7"
3806
//case 7: // GRR BOGUS
3807
//case 5: // GRR BOGUS
3809
_ActiveMask.use = 0x00000000ffffffffLL;
3810
_ActiveMask2.use = 0xffffffff00000000LL;
3811
_ShiftBpp.use = bpp << 3; // == bpp * 8
3812
_ShiftRem.use = 64 - _ShiftBpp.use;
3814
__asm__ __volatile__ (
3815
"movl _dif, %%ecx \n\t"
3816
// preload "movl row, %%edi \n\t"
3817
// preload "movl prev_row, %%esi \n\t"
3818
// prime the pump: load the first Raw(x-bpp) data set
3819
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3820
"pxor %%mm0, %%mm0 \n\t"
3823
// must shift to position Raw(x-bpp) data
3824
"psrlq _ShiftRem, %%mm1 \n\t"
3825
// do first set of 4 bytes
3826
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3827
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3828
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3829
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3830
// must shift to position Prior(x-bpp) data
3831
"psrlq _ShiftRem, %%mm3 \n\t"
3832
// pav = p - a = (a + b - c) - a = b - c
3833
"movq %%mm2, %%mm4 \n\t"
3834
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3835
// pbv = p - b = (a + b - c) - b = a - c
3836
"movq %%mm1, %%mm5 \n\t"
3837
"psubw %%mm3, %%mm4 \n\t"
3838
"pxor %%mm7, %%mm7 \n\t"
3839
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3840
"movq %%mm4, %%mm6 \n\t"
3841
"psubw %%mm3, %%mm5 \n\t"
3842
// pa = abs(p-a) = abs(pav)
3843
// pb = abs(p-b) = abs(pbv)
3844
// pc = abs(p-c) = abs(pcv)
3845
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3846
"paddw %%mm5, %%mm6 \n\t"
3847
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3848
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3849
"psubw %%mm0, %%mm4 \n\t"
3850
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3851
"psubw %%mm0, %%mm4 \n\t"
3852
"psubw %%mm7, %%mm5 \n\t"
3853
"pxor %%mm0, %%mm0 \n\t"
3854
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3855
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3856
"psubw %%mm7, %%mm5 \n\t"
3857
"psubw %%mm0, %%mm6 \n\t"
3859
"movq %%mm4, %%mm7 \n\t"
3860
"psubw %%mm0, %%mm6 \n\t"
3861
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3862
"movq %%mm7, %%mm0 \n\t"
3863
// use mm7 mask to merge pa & pb
3864
"pand %%mm7, %%mm5 \n\t"
3865
// use mm0 mask copy to merge a & b
3866
"pand %%mm0, %%mm2 \n\t"
3867
"pandn %%mm4, %%mm7 \n\t"
3868
"pandn %%mm1, %%mm0 \n\t"
3869
"paddw %%mm5, %%mm7 \n\t"
3870
"paddw %%mm2, %%mm0 \n\t"
3871
// test ((pa <= pb)? pa:pb) <= pc
3872
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3873
"pxor %%mm1, %%mm1 \n\t"
3874
"pand %%mm7, %%mm3 \n\t"
3875
"pandn %%mm0, %%mm7 \n\t"
3876
"paddw %%mm3, %%mm7 \n\t"
3877
"pxor %%mm0, %%mm0 \n\t"
3878
"packuswb %%mm1, %%mm7 \n\t"
3879
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3880
"pand _ActiveMask, %%mm7 \n\t"
3881
"psrlq _ShiftRem, %%mm3 \n\t"
3882
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3883
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3884
"movq %%mm2, %%mm6 \n\t"
3885
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3886
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3887
"psllq _ShiftBpp, %%mm6 \n\t"
3888
"movq %%mm7, %%mm5 \n\t"
3889
"psrlq _ShiftRem, %%mm1 \n\t"
3890
"por %%mm6, %%mm3 \n\t"
3891
"psllq _ShiftBpp, %%mm5 \n\t"
3892
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3893
"por %%mm5, %%mm1 \n\t"
3894
// do second set of 4 bytes
3895
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3896
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3897
// pav = p - a = (a + b - c) - a = b - c
3898
"movq %%mm2, %%mm4 \n\t"
3899
// pbv = p - b = (a + b - c) - b = a - c
3900
"movq %%mm1, %%mm5 \n\t"
3901
"psubw %%mm3, %%mm4 \n\t"
3902
"pxor %%mm7, %%mm7 \n\t"
3903
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3904
"movq %%mm4, %%mm6 \n\t"
3905
"psubw %%mm3, %%mm5 \n\t"
3906
// pa = abs(p-a) = abs(pav)
3907
// pb = abs(p-b) = abs(pbv)
3908
// pc = abs(p-c) = abs(pcv)
3909
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3910
"paddw %%mm5, %%mm6 \n\t"
3911
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3912
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3913
"psubw %%mm0, %%mm4 \n\t"
3914
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3915
"psubw %%mm0, %%mm4 \n\t"
3916
"psubw %%mm7, %%mm5 \n\t"
3917
"pxor %%mm0, %%mm0 \n\t"
3918
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3919
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3920
"psubw %%mm7, %%mm5 \n\t"
3921
"psubw %%mm0, %%mm6 \n\t"
3923
"movq %%mm4, %%mm7 \n\t"
3924
"psubw %%mm0, %%mm6 \n\t"
3925
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3926
"movq %%mm7, %%mm0 \n\t"
3927
// use mm7 mask to merge pa & pb
3928
"pand %%mm7, %%mm5 \n\t"
3929
// use mm0 mask copy to merge a & b
3930
"pand %%mm0, %%mm2 \n\t"
3931
"pandn %%mm4, %%mm7 \n\t"
3932
"pandn %%mm1, %%mm0 \n\t"
3933
"paddw %%mm5, %%mm7 \n\t"
3934
"paddw %%mm2, %%mm0 \n\t"
3935
// test ((pa <= pb)? pa:pb) <= pc
3936
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3937
"pxor %%mm1, %%mm1 \n\t"
3938
"pand %%mm7, %%mm3 \n\t"
3939
"pandn %%mm0, %%mm7 \n\t"
3940
"pxor %%mm1, %%mm1 \n\t"
3941
"paddw %%mm3, %%mm7 \n\t"
3942
"pxor %%mm0, %%mm0 \n\t"
3943
// step ecx to next set of 8 bytes and repeat loop til done
3944
"addl $8, %%ecx \n\t"
3945
"packuswb %%mm7, %%mm1 \n\t"
3946
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3947
"cmpl _MMXLength, %%ecx \n\t"
3948
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3949
// mm1 will be used as Raw(x-bpp) next loop
3952
: "=S" (dummy_value_S), // output regs (dummy)
3953
"=D" (dummy_value_D)
3955
: "0" (prev_row), // esi // input regs
3958
: "%ecx" // clobber list
3959
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3960
, "%mm0", "%mm1", "%mm2", "%mm3"
3961
, "%mm4", "%mm5", "%mm6", "%mm7"
3969
_ActiveMask.use = 0x00000000ffffffffLL;
3971
__asm__ __volatile__ (
3972
"movl _dif, %%ecx \n\t"
3973
// preload "movl row, %%edi \n\t"
3974
// preload "movl prev_row, %%esi \n\t"
3975
"pxor %%mm0, %%mm0 \n\t"
3976
// prime the pump: load the first Raw(x-bpp) data set
3977
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3978
// a=Raw(x-bpp) bytes
3980
// do first set of 4 bytes
3981
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3982
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3983
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3984
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3985
// pav = p - a = (a + b - c) - a = b - c
3986
"movq %%mm2, %%mm4 \n\t"
3987
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3988
// pbv = p - b = (a + b - c) - b = a - c
3989
"movq %%mm1, %%mm5 \n\t"
3990
"psubw %%mm3, %%mm4 \n\t"
3991
"pxor %%mm7, %%mm7 \n\t"
3992
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3993
"movq %%mm4, %%mm6 \n\t"
3994
"psubw %%mm3, %%mm5 \n\t"
3995
// pa = abs(p-a) = abs(pav)
3996
// pb = abs(p-b) = abs(pbv)
3997
// pc = abs(p-c) = abs(pcv)
3998
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3999
"paddw %%mm5, %%mm6 \n\t"
4000
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4001
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4002
"psubw %%mm0, %%mm4 \n\t"
4003
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4004
"psubw %%mm0, %%mm4 \n\t"
4005
"psubw %%mm7, %%mm5 \n\t"
4006
"pxor %%mm0, %%mm0 \n\t"
4007
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4008
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4009
"psubw %%mm7, %%mm5 \n\t"
4010
"psubw %%mm0, %%mm6 \n\t"
4012
"movq %%mm4, %%mm7 \n\t"
4013
"psubw %%mm0, %%mm6 \n\t"
4014
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4015
"movq %%mm7, %%mm0 \n\t"
4016
// use mm7 mask to merge pa & pb
4017
"pand %%mm7, %%mm5 \n\t"
4018
// use mm0 mask copy to merge a & b
4019
"pand %%mm0, %%mm2 \n\t"
4020
"pandn %%mm4, %%mm7 \n\t"
4021
"pandn %%mm1, %%mm0 \n\t"
4022
"paddw %%mm5, %%mm7 \n\t"
4023
"paddw %%mm2, %%mm0 \n\t"
4024
// test ((pa <= pb)? pa:pb) <= pc
4025
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4026
"pxor %%mm1, %%mm1 \n\t"
4027
"pand %%mm7, %%mm3 \n\t"
4028
"pandn %%mm0, %%mm7 \n\t"
4029
"paddw %%mm3, %%mm7 \n\t"
4030
"pxor %%mm0, %%mm0 \n\t"
4031
"packuswb %%mm1, %%mm7 \n\t"
4032
"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
4033
"pand _ActiveMask, %%mm7 \n\t"
4034
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
4035
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4036
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4037
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4038
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
4039
// do second set of 4 bytes
4040
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4041
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4042
// pav = p - a = (a + b - c) - a = b - c
4043
"movq %%mm2, %%mm4 \n\t"
4044
// pbv = p - b = (a + b - c) - b = a - c
4045
"movq %%mm1, %%mm5 \n\t"
4046
"psubw %%mm3, %%mm4 \n\t"
4047
"pxor %%mm7, %%mm7 \n\t"
4048
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4049
"movq %%mm4, %%mm6 \n\t"
4050
"psubw %%mm3, %%mm5 \n\t"
4051
// pa = abs(p-a) = abs(pav)
4052
// pb = abs(p-b) = abs(pbv)
4053
// pc = abs(p-c) = abs(pcv)
4054
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4055
"paddw %%mm5, %%mm6 \n\t"
4056
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4057
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4058
"psubw %%mm0, %%mm4 \n\t"
4059
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4060
"psubw %%mm0, %%mm4 \n\t"
4061
"psubw %%mm7, %%mm5 \n\t"
4062
"pxor %%mm0, %%mm0 \n\t"
4063
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4064
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4065
"psubw %%mm7, %%mm5 \n\t"
4066
"psubw %%mm0, %%mm6 \n\t"
4068
"movq %%mm4, %%mm7 \n\t"
4069
"psubw %%mm0, %%mm6 \n\t"
4070
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4071
"movq %%mm7, %%mm0 \n\t"
4072
// use mm7 mask to merge pa & pb
4073
"pand %%mm7, %%mm5 \n\t"
4074
// use mm0 mask copy to merge a & b
4075
"pand %%mm0, %%mm2 \n\t"
4076
"pandn %%mm4, %%mm7 \n\t"
4077
"pandn %%mm1, %%mm0 \n\t"
4078
"paddw %%mm5, %%mm7 \n\t"
4079
"paddw %%mm2, %%mm0 \n\t"
4080
// test ((pa <= pb)? pa:pb) <= pc
4081
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4082
"pxor %%mm1, %%mm1 \n\t"
4083
"pand %%mm7, %%mm3 \n\t"
4084
"pandn %%mm0, %%mm7 \n\t"
4085
"pxor %%mm1, %%mm1 \n\t"
4086
"paddw %%mm3, %%mm7 \n\t"
4087
"pxor %%mm0, %%mm0 \n\t"
4088
// step ecx to next set of 8 bytes and repeat loop til done
4089
"addl $8, %%ecx \n\t"
4090
"packuswb %%mm7, %%mm1 \n\t"
4091
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4092
"cmpl _MMXLength, %%ecx \n\t"
4093
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4094
// mm1 will be used as Raw(x-bpp) next loop
4097
: "=S" (dummy_value_S), // output regs (dummy)
4098
"=D" (dummy_value_D)
4100
: "0" (prev_row), // esi // input regs
4103
: "%ecx" // clobber list
4104
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4105
, "%mm0", "%mm1", "%mm2", "%mm3"
4106
, "%mm4", "%mm5", "%mm6", "%mm7"
4114
_ActiveMask.use = 0x00000000ffffffffLL;
4116
__asm__ __volatile__ (
4117
"movl _dif, %%ecx \n\t"
4118
// preload "movl row, %%edi \n\t"
4119
// preload "movl prev_row, %%esi \n\t"
4120
"pxor %%mm0, %%mm0 \n\t"
4121
// prime the pump: load the first Raw(x-bpp) data set
4122
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4123
// a=Raw(x-bpp) bytes
4125
// do first set of 4 bytes
4126
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4127
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
4128
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4129
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
4130
// pav = p - a = (a + b - c) - a = b - c
4131
"movq %%mm2, %%mm4 \n\t"
4132
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
4133
// pbv = p - b = (a + b - c) - b = a - c
4134
"movq %%mm1, %%mm5 \n\t"
4135
"psubw %%mm3, %%mm4 \n\t"
4136
"pxor %%mm7, %%mm7 \n\t"
4137
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4138
"movq %%mm4, %%mm6 \n\t"
4139
"psubw %%mm3, %%mm5 \n\t"
4140
// pa = abs(p-a) = abs(pav)
4141
// pb = abs(p-b) = abs(pbv)
4142
// pc = abs(p-c) = abs(pcv)
4143
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4144
"paddw %%mm5, %%mm6 \n\t"
4145
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4146
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4147
"psubw %%mm0, %%mm4 \n\t"
4148
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4149
"psubw %%mm0, %%mm4 \n\t"
4150
"psubw %%mm7, %%mm5 \n\t"
4151
"pxor %%mm0, %%mm0 \n\t"
4152
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4153
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4154
"psubw %%mm7, %%mm5 \n\t"
4155
"psubw %%mm0, %%mm6 \n\t"
4157
"movq %%mm4, %%mm7 \n\t"
4158
"psubw %%mm0, %%mm6 \n\t"
4159
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4160
"movq %%mm7, %%mm0 \n\t"
4161
// use mm7 mask to merge pa & pb
4162
"pand %%mm7, %%mm5 \n\t"
4163
// use mm0 mask copy to merge a & b
4164
"pand %%mm0, %%mm2 \n\t"
4165
"pandn %%mm4, %%mm7 \n\t"
4166
"pandn %%mm1, %%mm0 \n\t"
4167
"paddw %%mm5, %%mm7 \n\t"
4168
"paddw %%mm2, %%mm0 \n\t"
4169
// test ((pa <= pb)? pa:pb) <= pc
4170
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4171
"pxor %%mm1, %%mm1 \n\t"
4172
"pand %%mm7, %%mm3 \n\t"
4173
"pandn %%mm0, %%mm7 \n\t"
4174
"paddw %%mm3, %%mm7 \n\t"
4175
"pxor %%mm0, %%mm0 \n\t"
4176
"packuswb %%mm1, %%mm7 \n\t"
4177
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4178
"pand _ActiveMask, %%mm7 \n\t"
4179
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
4180
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4181
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
4182
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
4183
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4185
// do second set of 4 bytes
4186
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
4187
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
4188
// pav = p - a = (a + b - c) - a = b - c
4189
"movq %%mm2, %%mm4 \n\t"
4190
// pbv = p - b = (a + b - c) - b = a - c
4191
"movq %%mm1, %%mm5 \n\t"
4192
"psubw %%mm3, %%mm4 \n\t"
4193
"pxor %%mm7, %%mm7 \n\t"
4194
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4195
"movq %%mm4, %%mm6 \n\t"
4196
"psubw %%mm3, %%mm5 \n\t"
4197
// pa = abs(p-a) = abs(pav)
4198
// pb = abs(p-b) = abs(pbv)
4199
// pc = abs(p-c) = abs(pcv)
4200
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
4201
"paddw %%mm5, %%mm6 \n\t"
4202
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
4203
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
4204
"psubw %%mm0, %%mm4 \n\t"
4205
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
4206
"psubw %%mm0, %%mm4 \n\t"
4207
"psubw %%mm7, %%mm5 \n\t"
4208
"pxor %%mm0, %%mm0 \n\t"
4209
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
4210
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
4211
"psubw %%mm7, %%mm5 \n\t"
4212
"psubw %%mm0, %%mm6 \n\t"
4214
"movq %%mm4, %%mm7 \n\t"
4215
"psubw %%mm0, %%mm6 \n\t"
4216
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
4217
"movq %%mm7, %%mm0 \n\t"
4218
// use mm7 mask to merge pa & pb
4219
"pand %%mm7, %%mm5 \n\t"
4220
// use mm0 mask copy to merge a & b
4221
"pand %%mm0, %%mm2 \n\t"
4222
"pandn %%mm4, %%mm7 \n\t"
4223
"pandn %%mm1, %%mm0 \n\t"
4224
"paddw %%mm5, %%mm7 \n\t"
4225
"paddw %%mm2, %%mm0 \n\t"
4226
// test ((pa <= pb)? pa:pb) <= pc
4227
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
4228
"pxor %%mm1, %%mm1 \n\t"
4229
"pand %%mm7, %%mm3 \n\t"
4230
"pandn %%mm0, %%mm7 \n\t"
4231
"pxor %%mm1, %%mm1 \n\t"
4232
"paddw %%mm3, %%mm7 \n\t"
4233
"pxor %%mm0, %%mm0 \n\t"
4234
// step ecx to next set of 8 bytes and repeat loop til done
4235
"addl $8, %%ecx \n\t"
4236
"packuswb %%mm7, %%mm1 \n\t"
4237
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4238
"cmpl _MMXLength, %%ecx \n\t"
4239
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4240
// mm1 will be used as Raw(x-bpp) next loop
4243
: "=S" (dummy_value_S), // output regs (dummy)
4244
"=D" (dummy_value_D)
4246
: "0" (prev_row), // esi // input regs
4249
: "%ecx" // clobber list
4250
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4251
, "%mm0", "%mm1", "%mm2", "%mm3"
4252
, "%mm4", "%mm5", "%mm6", "%mm7"
4262
__asm__ __volatile__ (
4264
"pushl %%ebx \n\t" // save Global Offset Table index
4266
"movl _dif, %%ebx \n\t"
4267
"cmpl _FullLength, %%ebx \n\t"
4268
"jnb paeth_dend \n\t"
4270
// preload "movl row, %%edi \n\t"
4271
// preload "movl prev_row, %%esi \n\t"
4272
// do Paeth decode for remaining bytes
4273
"movl %%ebx, %%edx \n\t"
4274
// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4275
"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4276
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
4279
"xorl %%eax, %%eax \n\t"
4280
// pav = p - a = (a + b - c) - a = b - c
4281
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4282
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4283
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4284
"movl %%eax, _patemp \n\t" // Save pav for later use
4285
"xorl %%eax, %%eax \n\t"
4286
// pbv = p - b = (a + b - c) - b = a - c
4287
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4288
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4289
"movl %%eax, %%ecx \n\t"
4290
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4291
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
4293
"testl $0x80000000, %%eax \n\t"
4294
"jz paeth_dpca \n\t"
4295
"negl %%eax \n\t" // reverse sign of neg values
4298
"movl %%eax, _pctemp \n\t" // save pc for later use
4300
"testl $0x80000000, %%ecx \n\t"
4301
"jz paeth_dpba \n\t"
4302
"negl %%ecx \n\t" // reverse sign of neg values
4305
"movl %%ecx, _pbtemp \n\t" // save pb for later use
4307
"movl _patemp, %%eax \n\t"
4308
"testl $0x80000000, %%eax \n\t"
4309
"jz paeth_dpaa \n\t"
4310
"negl %%eax \n\t" // reverse sign of neg values
4313
"movl %%eax, _patemp \n\t" // save pa for later use
4315
"cmpl %%ecx, %%eax \n\t"
4316
"jna paeth_dabb \n\t"
4317
// pa > pb; now test if pb <= pc
4318
"cmpl _pctemp, %%ecx \n\t"
4319
"jna paeth_dbbc \n\t"
4320
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4321
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4322
"jmp paeth_dpaeth \n\t"
4325
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4326
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4327
"jmp paeth_dpaeth \n\t"
4330
// pa <= pb; now test if pa <= pc
4331
"cmpl _pctemp, %%eax \n\t"
4332
"jna paeth_dabc \n\t"
4333
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4334
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4335
"jmp paeth_dpaeth \n\t"
4338
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4339
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4341
"paeth_dpaeth: \n\t"
4344
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4345
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
4346
"cmpl _FullLength, %%ebx \n\t"
4351
"popl %%ebx \n\t" // index to Global Offset Table
4354
: "=c" (dummy_value_c), // output regs (dummy)
4355
"=S" (dummy_value_S),
4356
"=D" (dummy_value_D)
4358
: "0" (bpp), // ecx // input regs
4359
"1" (prev_row), // esi
4362
: "%eax", "%edx" // clobber list
4368
return; // No need to go further with this one
4370
} // end switch (bpp)
4372
__asm__ __volatile__ (
4373
// MMX acceleration complete; now do clean-up
4374
// check if any remaining bytes left to decode
4376
"pushl %%ebx \n\t" // save index to Global Offset Table
4378
"movl _MMXLength, %%ebx \n\t"
4379
"cmpl _FullLength, %%ebx \n\t"
4380
"jnb paeth_end \n\t"
4381
//pre "movl row, %%edi \n\t"
4382
//pre "movl prev_row, %%esi \n\t"
4383
// do Paeth decode for remaining bytes
4384
"movl %%ebx, %%edx \n\t"
4385
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4386
"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4387
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4390
"xorl %%eax, %%eax \n\t"
4391
// pav = p - a = (a + b - c) - a = b - c
4392
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4393
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4394
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4395
"movl %%eax, _patemp \n\t" // Save pav for later use
4396
"xorl %%eax, %%eax \n\t"
4397
// pbv = p - b = (a + b - c) - b = a - c
4398
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4399
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4400
"movl %%eax, %%ecx \n\t"
4401
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4402
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
4404
"testl $0x80000000, %%eax \n\t"
4405
"jz paeth_pca2 \n\t"
4406
"negl %%eax \n\t" // reverse sign of neg values
4409
"movl %%eax, _pctemp \n\t" // save pc for later use
4411
"testl $0x80000000, %%ecx \n\t"
4412
"jz paeth_pba2 \n\t"
4413
"negl %%ecx \n\t" // reverse sign of neg values
4416
"movl %%ecx, _pbtemp \n\t" // save pb for later use
4418
"movl _patemp, %%eax \n\t"
4419
"testl $0x80000000, %%eax \n\t"
4420
"jz paeth_paa2 \n\t"
4421
"negl %%eax \n\t" // reverse sign of neg values
4424
"movl %%eax, _patemp \n\t" // save pa for later use
4426
"cmpl %%ecx, %%eax \n\t"
4427
"jna paeth_abb2 \n\t"
4428
// pa > pb; now test if pb <= pc
4429
"cmpl _pctemp, %%ecx \n\t"
4430
"jna paeth_bbc2 \n\t"
4431
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4432
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4433
"jmp paeth_paeth2 \n\t"
4436
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4437
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4438
"jmp paeth_paeth2 \n\t"
4441
// pa <= pb; now test if pa <= pc
4442
"cmpl _pctemp, %%eax \n\t"
4443
"jna paeth_abc2 \n\t"
4444
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4445
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4446
"jmp paeth_paeth2 \n\t"
4449
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4450
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4452
"paeth_paeth2: \n\t"
4455
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4456
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
4457
"cmpl _FullLength, %%ebx \n\t"
4461
"EMMS \n\t" // end MMX; prep for poss. FP instrs.
4463
"popl %%ebx \n\t" // restore index to Global Offset Table
4466
: "=c" (dummy_value_c), // output regs (dummy)
4467
"=S" (dummy_value_S),
4468
"=D" (dummy_value_D)
4470
: "0" (bpp), // ecx // input regs
4471
"1" (prev_row), // esi
4474
: "%eax", "%edx" // clobber list (no input regs!)
4480
} /* end png_read_filter_row_mmx_paeth() */
4486
#ifdef PNG_THREAD_UNSAFE_OK
4487
//===========================================================================//
4489
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4491
//===========================================================================//
4493
// Optimized code for PNG Sub filter decoder
4495
static void /* PRIVATE */
4496
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4502
bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4503
_FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4505
__asm__ __volatile__ (
4506
//pre "movl row, %%edi \n\t"
4507
"movl %%edi, %%esi \n\t" // lp = row
4508
//pre "movl bpp, %%eax \n\t"
4509
"addl %%eax, %%edi \n\t" // rp = row + bpp
4510
//irr "xorl %%eax, %%eax \n\t"
4511
// get # of bytes to alignment
4512
"movl %%edi, _dif \n\t" // take start of row
4513
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4514
// alignment boundary
4515
"xorl %%ecx, %%ecx \n\t"
4516
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4517
"subl %%edi, _dif \n\t" // subtract from start ==> value
4518
"jz sub_go \n\t" // ecx at alignment
4520
"sub_lp1: \n\t" // fix alignment
4521
"movb (%%esi,%%ecx,), %%al \n\t"
4522
"addb %%al, (%%edi,%%ecx,) \n\t"
4524
"cmpl _dif, %%ecx \n\t"
4528
"movl _FullLength, %%eax \n\t"
4529
"movl %%eax, %%edx \n\t"
4530
"subl %%ecx, %%edx \n\t" // subtract alignment fix
4531
"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4532
"subl %%edx, %%eax \n\t" // drop over bytes from length
4533
"movl %%eax, _MMXLength \n\t"
4535
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4536
"=D" (dummy_value_D) // 1
4538
: "0" (bpp), // eax // input regs
4541
: "%esi", "%ecx", "%edx" // clobber list
4543
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4544
, "%mm0", "%mm1", "%mm2", "%mm3"
4545
, "%mm4", "%mm5", "%mm6", "%mm7"
4549
// now do the math for the rest of the row
4554
_ActiveMask.use = 0x0000ffffff000000LL;
4555
_ShiftBpp.use = 24; // == 3 * 8
4556
_ShiftRem.use = 40; // == 64 - 24
4558
__asm__ __volatile__ (
4559
// preload "movl row, %%edi \n\t"
4560
"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4561
// active byte group
4562
"movl %%edi, %%esi \n\t" // lp = row
4563
// preload "movl bpp, %%eax \n\t"
4564
"addl %%eax, %%edi \n\t" // rp = row + bpp
4565
"movq %%mm7, %%mm6 \n\t"
4566
"movl _dif, %%edx \n\t"
4567
"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4568
// 3rd active byte group
4569
// prime the pump: load the first Raw(x-bpp) data set
4570
"movq -8(%%edi,%%edx,), %%mm1 \n\t"
4572
"sub_3lp: \n\t" // shift data for adding first
4573
"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4574
// shift clears inactive bytes)
4575
// add 1st active group
4576
"movq (%%edi,%%edx,), %%mm0 \n\t"
4577
"paddb %%mm1, %%mm0 \n\t"
4579
// add 2nd active group
4580
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4581
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4582
"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4583
"paddb %%mm1, %%mm0 \n\t"
4585
// add 3rd active group
4586
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4587
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4588
"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4589
"addl $8, %%edx \n\t"
4590
"paddb %%mm1, %%mm0 \n\t"
4592
"cmpl _MMXLength, %%edx \n\t"
4593
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4594
"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4597
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4598
"=D" (dummy_value_D) // 1
4600
: "0" (bpp), // eax // input regs
4603
: "%edx", "%esi" // clobber list
4604
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4605
, "%mm0", "%mm1", "%mm6", "%mm7"
4613
__asm__ __volatile__ (
4614
"movl _dif, %%edx \n\t"
4615
// preload "movl row, %%edi \n\t"
4616
"cmpl _FullLength, %%edx \n\t"
4618
"movl %%edi, %%esi \n\t" // lp = row
4619
"xorl %%eax, %%eax \n\t"
4620
// preload "movl bpp, %%eax \n\t"
4621
"addl %%eax, %%edi \n\t" // rp = row + bpp
4624
"movb (%%esi,%%edx,), %%al \n\t"
4625
"addb %%al, (%%edi,%%edx,) \n\t"
4627
"cmpl _FullLength, %%edx \n\t"
4632
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4633
"=D" (dummy_value_D) // 1
4635
: "0" (bpp), // eax // input regs
4638
: "%edx", "%esi" // clobber list
4645
//case 7: // GRR BOGUS
4646
//case 5: // GRR BOGUS
4648
_ShiftBpp.use = bpp << 3;
4649
_ShiftRem.use = 64 - _ShiftBpp.use;
4651
__asm__ __volatile__ (
4652
// preload "movl row, %%edi \n\t"
4653
"movl _dif, %%edx \n\t"
4654
"movl %%edi, %%esi \n\t" // lp = row
4655
// preload "movl bpp, %%eax \n\t"
4656
"addl %%eax, %%edi \n\t" // rp = row + bpp
4658
// prime the pump: load the first Raw(x-bpp) data set
4659
"movq -8(%%edi,%%edx,), %%mm1 \n\t"
4661
"sub_4lp: \n\t" // shift data for adding first
4662
"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4663
// shift clears inactive bytes)
4664
"movq (%%edi,%%edx,), %%mm0 \n\t"
4665
"paddb %%mm1, %%mm0 \n\t"
4667
// add 2nd active group
4668
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4669
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4670
"addl $8, %%edx \n\t"
4671
"paddb %%mm1, %%mm0 \n\t"
4673
"cmpl _MMXLength, %%edx \n\t"
4674
"movq %%mm0, -8(%%edi,%%edx,) \n\t"
4675
"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4678
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4679
"=D" (dummy_value_D) // 1
4681
: "0" (bpp), // eax // input regs
4684
: "%edx", "%esi" // clobber list
4685
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4694
_ActiveMask.use = 0x00000000ffff0000LL;
4695
_ShiftBpp.use = 16; // == 2 * 8
4696
_ShiftRem.use = 48; // == 64 - 16
4698
__asm__ __volatile__ (
4699
"movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4700
// active byte group
4701
"movl _dif, %%edx \n\t"
4702
"movq %%mm7, %%mm6 \n\t"
4703
// preload "movl row, %%edi \n\t"
4704
"psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4705
// 3rd active byte group
4706
"movl %%edi, %%esi \n\t" // lp = row
4707
"movq %%mm6, %%mm5 \n\t"
4708
// preload "movl bpp, %%eax \n\t"
4709
"addl %%eax, %%edi \n\t" // rp = row + bpp
4710
"psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4711
// 4th active byte group
4712
// prime the pump: load the first Raw(x-bpp) data set
4713
"movq -8(%%edi,%%edx,), %%mm1 \n\t"
4715
"sub_2lp: \n\t" // shift data for adding first
4716
"psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4717
// shift clears inactive bytes)
4718
// add 1st active group
4719
"movq (%%edi,%%edx,), %%mm0 \n\t"
4720
"paddb %%mm1, %%mm0 \n\t"
4722
// add 2nd active group
4723
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4724
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4725
"pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4726
"paddb %%mm1, %%mm0 \n\t"
4728
// add 3rd active group
4729
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4730
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4731
"pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4732
"paddb %%mm1, %%mm0 \n\t"
4734
// add 4th active group
4735
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4736
"psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4737
"pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4738
"addl $8, %%edx \n\t"
4739
"paddb %%mm1, %%mm0 \n\t"
4740
"cmpl _MMXLength, %%edx \n\t"
4741
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4742
"movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4745
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4746
"=D" (dummy_value_D) // 1
4748
: "0" (bpp), // eax // input regs
4751
: "%edx", "%esi" // clobber list
4752
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4753
, "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4761
__asm__ __volatile__ (
4762
// preload "movl row, %%edi \n\t"
4763
"movl _dif, %%edx \n\t"
4764
"movl %%edi, %%esi \n\t" // lp = row
4765
// preload "movl bpp, %%eax \n\t"
4766
"addl %%eax, %%edi \n\t" // rp = row + bpp
4767
"movl _MMXLength, %%ecx \n\t"
4769
// prime the pump: load the first Raw(x-bpp) data set
4770
"movq -8(%%edi,%%edx,), %%mm7 \n\t"
4771
"andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4774
"movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4775
"paddb %%mm7, %%mm0 \n\t"
4776
"movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4777
"movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4779
// Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4780
// This will be repeated for each group of 8 bytes with the 8th
4781
// group being used as the Raw(x-bpp) for the 1st group of the
4784
"paddb %%mm0, %%mm1 \n\t"
4785
"movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4786
"movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4787
"paddb %%mm1, %%mm2 \n\t"
4788
"movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4789
"movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4790
"paddb %%mm2, %%mm3 \n\t"
4791
"movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4792
"movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4793
"paddb %%mm3, %%mm4 \n\t"
4794
"movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4795
"movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4796
"paddb %%mm4, %%mm5 \n\t"
4797
"movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4798
"movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4799
"paddb %%mm5, %%mm6 \n\t"
4800
"movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4801
"movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4802
"addl $64, %%edx \n\t"
4803
"paddb %%mm6, %%mm7 \n\t"
4804
"cmpl %%ecx, %%edx \n\t"
4805
"movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4808
"cmpl _MMXLength, %%edx \n\t"
4812
"movq (%%edi,%%edx,), %%mm0 \n\t"
4813
"addl $8, %%edx \n\t"
4814
"paddb %%mm7, %%mm0 \n\t"
4815
"cmpl _MMXLength, %%edx \n\t"
4816
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4817
"movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4818
// to mm7 to be new Raw(x-bpp)
4824
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4825
"=D" (dummy_value_D) // 1
4827
: "0" (bpp), // eax // input regs
4830
: "%ecx", "%edx", "%esi" // clobber list
4831
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832
, "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4838
default: // bpp greater than 8 bytes GRR BOGUS
4840
__asm__ __volatile__ (
4841
"movl _dif, %%edx \n\t"
4842
// preload "movl row, %%edi \n\t"
4843
"movl %%edi, %%esi \n\t" // lp = row
4844
// preload "movl bpp, %%eax \n\t"
4845
"addl %%eax, %%edi \n\t" // rp = row + bpp
4848
"movq (%%edi,%%edx,), %%mm0 \n\t"
4849
"movq (%%esi,%%edx,), %%mm1 \n\t"
4850
"addl $8, %%edx \n\t"
4851
"paddb %%mm1, %%mm0 \n\t"
4852
"cmpl _MMXLength, %%edx \n\t"
4853
"movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4854
// -8 to offset addl edx
4857
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4858
"=D" (dummy_value_D) // 1
4860
: "0" (bpp), // eax // input regs
4863
: "%edx", "%esi" // clobber list
4864
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4871
} // end switch (bpp)
4873
__asm__ __volatile__ (
4874
"movl _MMXLength, %%edx \n\t"
4875
//pre "movl row, %%edi \n\t"
4876
"cmpl _FullLength, %%edx \n\t"
4879
"movl %%edi, %%esi \n\t" // lp = row
4880
//pre "movl bpp, %%eax \n\t"
4881
"addl %%eax, %%edi \n\t" // rp = row + bpp
4882
"xorl %%eax, %%eax \n\t"
4885
"movb (%%esi,%%edx,), %%al \n\t"
4886
"addb %%al, (%%edi,%%edx,) \n\t"
4888
"cmpl _FullLength, %%edx \n\t"
4892
"EMMS \n\t" // end MMX instructions
4894
: "=a" (dummy_value_a), // 0 // output regs (dummy)
4895
"=D" (dummy_value_D) // 1
4897
: "0" (bpp), // eax // input regs
4900
: "%edx", "%esi" // clobber list
4903
} // end of png_read_filter_row_mmx_sub()
4909
//===========================================================================//
4911
// P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4913
//===========================================================================//
4915
// Optimized code for PNG Up filter decoder
4917
// Reverses the PNG "Up" filter for one row using MMX: every byte of row has
// the byte at the same offset in prev_row added to it, and the sum is
// written back into row.  The asm walks the row in four phases:
//   1. up_lp1  - single bytes until row + offset reaches an 8-byte boundary
//   2. unrolled MMX loop - 64 bytes per iteration
//   3. up_lpA  - 8 bytes per iteration
//   4. up_lp2  - single bytes for whatever is left
// Parameters:
//   row_info - only row_info->rowbytes is read (byte count to filter)
//   row      - current row, modified in place (edi, per the "//pre" notes)
//   prev_row - previous row, read only by this code (esi)
static void /* PRIVATE */
4918
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4922
int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4926
len = row_info->rowbytes; // number of bytes to filter
4928
__asm__ __volatile__ (
4929
//pre "movl row, %%edi \n\t"
4930
// get # of bytes to alignment
4934
// ecx = ((row + 7) & ~7) - row: the count of leading bytes that sit before
// the next 8-byte boundary.  ebx is the running byte offset used to index
// both rows and starts at 0; eax is cleared so al can hold single bytes.
"movl %%edi, %%ecx \n\t"
4935
"xorl %%ebx, %%ebx \n\t"
4936
"addl $0x7, %%ecx \n\t"
4937
"xorl %%eax, %%eax \n\t"
4938
"andl $0xfffffff8, %%ecx \n\t"
4939
//pre "movl prev_row, %%esi \n\t"
4940
"subl %%edi, %%ecx \n\t"
4943
"up_lp1: \n\t" // fix alignment
4944
"movb (%%edi,%%ebx,), %%al \n\t"
4945
"addb (%%esi,%%ebx,), %%al \n\t"
4947
"cmpl %%ecx, %%ebx \n\t"
4948
"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4949
"jb up_lp1 \n\t" // offset incl ebx
4952
//pre "movl len, %%edx \n\t"
4953
// Split the remaining length: afterwards ecx is the offset at which the
// 64-byte loop must stop and edx is the number of bytes left over past it.
"movl %%edx, %%ecx \n\t"
4954
"subl %%ebx, %%edx \n\t" // subtract alignment fix
4955
"andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4956
"subl %%edx, %%ecx \n\t" // drop over bytes from length
4958
// unrolled loop - use all MMX registers and interleave to reduce
4959
// number of branch instructions (loops) and reduce partial stalls
4961
// Each group below loads 8 bytes of prev_row (via esi) and 8 bytes of row
// (via edi), adds them bytewise with paddb, and stores the sum back to row.
// Eight such groups (offsets 0..56) make up one 64-byte iteration.
"movq (%%esi,%%ebx,), %%mm1 \n\t"
4962
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4963
"movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4964
"paddb %%mm1, %%mm0 \n\t"
4965
"movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4966
"movq %%mm0, (%%edi,%%ebx,) \n\t"
4967
"paddb %%mm3, %%mm2 \n\t"
4968
"movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4969
"movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4970
"movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4971
"movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4972
"paddb %%mm5, %%mm4 \n\t"
4973
"movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4974
"movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4975
"paddb %%mm7, %%mm6 \n\t"
4976
"movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4977
"movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4978
"movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4979
"movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4980
"paddb %%mm1, %%mm0 \n\t"
4981
"movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4982
"movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4983
"paddb %%mm3, %%mm2 \n\t"
4984
"movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4985
"movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4986
"movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4987
"movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4988
"paddb %%mm5, %%mm4 \n\t"
4989
"movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4990
"movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4991
"addl $64, %%ebx \n\t"
4992
"paddb %%mm7, %%mm6 \n\t"
4993
"cmpl %%ecx, %%ebx \n\t"
4994
"movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4995
"jb up_loop \n\t" // -8 to offset addl ebx
4997
"cmpl $0, %%edx \n\t" // test for bytes over mult of 64
5000
"cmpl $8, %%edx \n\t" // test for less than 8 bytes
5001
"jb up_lt8 \n\t" // [added by lcreeve at netins.net]
5003
// Same split again at 8-byte granularity: ecx becomes the stop offset for
// up_lpA and edx keeps only the final 0..7 bytes.
"addl %%edx, %%ecx \n\t"
5004
"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
5005
"subl %%edx, %%ecx \n\t" // drop over bytes from length
5008
"up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
5009
"movq (%%esi,%%ebx,), %%mm1 \n\t"
5010
"movq (%%edi,%%ebx,), %%mm0 \n\t"
5011
"addl $8, %%ebx \n\t"
5012
"paddb %%mm1, %%mm0 \n\t"
5013
"cmpl %%ecx, %%ebx \n\t"
5014
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
5015
"jb up_lpA \n\t" // offset add ebx
5016
"cmpl $0, %%edx \n\t" // test for bytes over mult of 8
5020
"xorl %%eax, %%eax \n\t"
5021
"addl %%edx, %%ecx \n\t" // move over byte count into counter
5023
"up_lp2: \n\t" // use x86 regs for remaining bytes
5024
"movb (%%edi,%%ebx,), %%al \n\t"
5025
"addb (%%esi,%%ebx,), %%al \n\t"
5027
"cmpl %%ecx, %%ebx \n\t"
5028
"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
5029
"jb up_lp2 \n\t" // offset inc ebx
5032
"EMMS \n\t" // conversion of filtered row complete
5037
// The inputs are tied to the dummy outputs with matching constraints
// ("0", "1"), so gcc loads len into edx and prev_row into esi and also
// knows those registers no longer hold their input values afterwards.
: "=d" (dummy_value_d), // 0 // output regs (dummy)
5038
"=S" (dummy_value_S), // 1
5039
"=D" (dummy_value_D) // 2
5041
: "0" (len), // edx // input regs
5042
"1" (prev_row), // esi
5045
: "%eax", "%ecx" // clobber list (no input regs!)
5050
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5051
, "%mm0", "%mm1", "%mm2", "%mm3"
5052
, "%mm4", "%mm5", "%mm6", "%mm7"
5056
} // end of png_read_filter_row_mmx_up()
5058
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5063
/*===========================================================================*/
5065
/* P N G _ R E A D _ F I L T E R _ R O W */
5067
/*===========================================================================*/
5070
/* Optimized png_read_filter_row routines */
5073
/* Reverses the PNG row filter selected by `filter` on `row`, in place.
 *
 * For the Sub, Up, Average and Paeth filters the matching MMX routine is
 * called when its PNG_ASM_FLAG_MMX_READ_FILTER_* bit is set in
 * png_ptr->asm_flags and the row reaches both png_ptr->mmx_bitdepth_threshold
 * (compared with row_info->pixel_depth) and png_ptr->mmx_rowbytes_threshold
 * (compared with row_info->rowbytes); the plain C loops below are the
 * non-MMX path.  An unrecognized filter value is reported with png_warning.
 *
 *   png_ptr  - read struct; supplies asm_flags, the two thresholds, and
 *              row_number for the debug output
 *   row_info - supplies rowbytes and pixel_depth
 *   row      - filtered row data, replaced by the reconstructed bytes
 *   prev_row - the previous row, read by the Up, Average and Paeth cases
 *   filter   - PNG_FILTER_VALUE_* selector
 */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5074
row, png_bytep prev_row, int filter)
5080
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5081
/* GRR: these are superseded by png_ptr->asm_flags: */
5082
#define UseMMX_sub 1 // GRR: converted 20000730
5083
#define UseMMX_up 1 // GRR: converted 20000729
5084
#define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
5085
#define UseMMX_paeth 1 // GRR: converted 20000828
5087
/* NOTE(review): presumably 2 is the "not probed yet" value of
 * _mmx_supported -- confirm against its definition elsewhere in the file. */
if (_mmx_supported == 2) {
5088
/* this should have happened in png_init_mmx_flags() already */
5089
#if !defined(PNG_1_0_X)
5090
png_warning(png_ptr, "asm_flags may not have been initialized");
5094
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5097
png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5100
/* Debug output only (see the PNG_DEBUG #endif below): filnm receives a
 * short tag naming the filter, suffixed according to the MMX flag test. */
case 0: sprintf(filnm, "none");
5102
case 1: sprintf(filnm, "sub-%s",
5103
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5104
#if !defined(PNG_1_0_X)
5105
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
5110
case 2: sprintf(filnm, "up-%s",
5111
#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5112
#if !defined(PNG_1_0_X)
5113
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5118
case 3: sprintf(filnm, "avg-%s",
5119
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5120
#if !defined(PNG_1_0_X)
5121
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5126
case 4: sprintf(filnm, "Paeth-%s",
5127
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5128
#if !defined(PNG_1_0_X)
5129
(png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5134
default: sprintf(filnm, "unknw");
5137
png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5138
png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5139
png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5140
(int)((row_info->pixel_depth + 7) >> 3));
5141
png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5142
#endif /* PNG_DEBUG */
5146
case PNG_FILTER_VALUE_NONE:
5149
case PNG_FILTER_VALUE_SUB:
5150
/* The Sub MMX path is compiled only when PNG_THREAD_UNSAFE_OK is defined
 * in addition to PNG_ASSEMBLER_CODE_SUPPORTED. */
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5151
#if !defined(PNG_1_0_X)
5152
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5153
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5154
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5159
png_read_filter_row_mmx_sub(row_info, row);
5162
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5165
/* C path for Sub: bpp is the pixel size in whole bytes (pixel_depth rounded
 * up).  The loop starts at byte bpp, so the first pixel's bytes are left
 * as they are; each later byte has a byte read through lp added, mod 256. */
png_uint_32 istop = row_info->rowbytes;
5166
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5167
png_bytep rp = row + bpp;
5170
for (i = bpp; i < istop; i++)
5172
*rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5175
} /* end !UseMMX_sub */
5178
case PNG_FILTER_VALUE_UP:
5179
/* Unlike Sub, Average and Paeth, the Up MMX path is guarded by
 * PNG_ASSEMBLER_CODE_SUPPORTED alone (no PNG_THREAD_UNSAFE_OK test). */
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5180
#if !defined(PNG_1_0_X)
5181
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5182
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5183
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5188
png_read_filter_row_mmx_up(row_info, row, prev_row);
5191
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5194
/* C path for Up: every byte of the row has the prev_row byte at the same
 * position added to it, mod 256. */
png_uint_32 istop = row_info->rowbytes;
5196
png_bytep pp = prev_row;
5198
for (i = 0; i < istop; ++i)
5200
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5203
} /* end !UseMMX_up */
5206
case PNG_FILTER_VALUE_AVG:
5207
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5208
#if !defined(PNG_1_0_X)
5209
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5210
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5211
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5216
png_read_filter_row_mmx_avg(row_info, row, prev_row);
5219
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5223
/* C path for Average, in two loops: the first bpp bytes add half of the
 * prev_row byte only; the remaining rowbytes - bpp bytes add half of the
 * sum of the prev_row byte and the byte read through lp. */
png_bytep pp = prev_row;
5225
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5226
png_uint_32 istop = row_info->rowbytes - bpp;
5228
for (i = 0; i < bpp; i++)
5230
*rp = (png_byte)(((int)(*rp) +
5231
((int)(*pp++) >> 1)) & 0xff);
5235
for (i = 0; i < istop; i++)
5237
*rp = (png_byte)(((int)(*rp) +
5238
((int)(*pp++ + *lp++) >> 1)) & 0xff);
5241
} /* end !UseMMX_avg */
5244
case PNG_FILTER_VALUE_PAETH:
5245
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5246
#if !defined(PNG_1_0_X)
5247
if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5248
(row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5249
(row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5254
png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5257
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5261
/* C path for Paeth: pp and cp both start at prev_row.  The first bpp bytes
 * simply add the prev_row byte; the second loop continues with the same rp
 * and pp and picks a predictor per byte. */
png_bytep pp = prev_row;
5263
png_bytep cp = prev_row;
5264
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5265
png_uint_32 istop = row_info->rowbytes - bpp;
5267
for (i = 0; i < bpp; i++)
5269
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5273
for (i = 0; i < istop; i++) /* use leftover rp,pp */
5275
int a, b, c, pa, pb, pc, p;
5289
/* pa, pb, pc become the absolute values of p, pc and p + pc.  pc must be
 * assigned last because the first two lines still read its old value. */
pa = p < 0 ? -p : p;
5290
pb = pc < 0 ? -pc : pc;
5291
pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5295
if (pa <= pb && pa <= pc)
5303
/* Predictor choice: a when pa is the smallest, otherwise b when
 * pb <= pc, otherwise c (ties resolve in that order). */
p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5305
*rp = (png_byte)(((int)(*rp) + p) & 0xff);
5308
} /* end !UseMMX_paeth */
5312
png_warning(png_ptr, "Ignoring bad row-filter type");
5318
#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5321
/*===========================================================================*/
5323
/* P N G _ M M X _ S U P P O R T */
5325
/*===========================================================================*/
5327
/* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
5328
* (2) all instructions compile with gcc 2.7.2.3 and later
5329
* (3) the function is moved down here to prevent gcc from
5330
* inlining it in multiple places and then barfing be-
5331
* cause the ".NOT_SUPPORTED" label is multiply defined
5332
* [is there a way to signal that a *single* function should
5333
* not be inlined? is there a way to modify the label for
5334
* each inlined instance, e.g., by appending _1, _2, etc.?
5335
* maybe if don't use leading "." in label name? (nope...sigh)]
5339
// Reports whether the CPU supports MMX.  When PNG_MMX_CODE_SUPPORTED is
// defined the asm below (1) checks that the ID bit (bit 21) of EFLAGS can
// be toggled, which means CPUID exists, (2) checks that CPUID function 0
// reports at least function 1, and (3) tests bit 23 of edx from CPUID
// function 1.  The asm stores 1 or 0 into _mmx_supported; the function
// returns whatever _mmx_supported holds.
png_mmx_support(void)
5341
#if defined(PNG_MMX_CODE_SUPPORTED)
5342
__asm__ __volatile__ (
5343
"pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
5344
"pushl %%ecx \n\t" // so does ecx...
5345
"pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
5346
// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
5347
// "pushf \n\t" // 16-bit pushf
5348
// Step 1: copy EFLAGS, flip the ID bit, write it back, and read EFLAGS
// again.  ecx keeps the original value so it can be restored and compared.
"pushfl \n\t" // save Eflag to stack
5349
"popl %%eax \n\t" // get Eflag from stack into eax
5350
"movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
5351
"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5352
"pushl %%eax \n\t" // save modified Eflag back to stack
5353
// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
5354
// "popf \n\t" // 16-bit popf
5355
"popfl \n\t" // restore modified value to Eflag reg
5356
"pushfl \n\t" // save Eflag to stack
5357
"popl %%eax \n\t" // get Eflag from stack
5358
"pushl %%ecx \n\t" // save original Eflag to stack
5359
"popfl \n\t" // restore original Eflag
5360
"xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
5361
"jz 0f \n\t" // if same, CPUID instr. is not supported
5363
// Step 2: CPUID with eax = 0; the value returned in eax must be >= 1
// before function 1 is queried.
"xorl %%eax, %%eax \n\t" // set eax to zero
5364
// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
5365
"cpuid \n\t" // get the CPU identification info
5366
"cmpl $1, %%eax \n\t" // make sure eax return non-zero value
5367
"jl 0f \n\t" // if eax is zero, MMX is not supported
5369
// Step 3: CPUID with eax = 1 and test the MMX bit in edx.
"xorl %%eax, %%eax \n\t" // set eax to zero and...
5370
"incl %%eax \n\t" // ...increment eax to 1. This pair is
5371
// faster than the instruction "mov eax, 1"
5372
"cpuid \n\t" // get the CPU identification info again
5373
"andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5374
"cmpl $0, %%edx \n\t" // 0 = MMX not supported
5375
"jz 0f \n\t" // non-zero = yes, MMX IS supported
5377
"movl $1, %%eax \n\t" // set return value to 1
5378
"jmp 1f \n\t" // DONE: have MMX support
5380
// 0: and 1: are numeric local labels, referenced as 0f/1f ("forward").
"0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
5381
"movl $0, %%eax \n\t" // set return value to 0
5382
"1: \n\t" // .RETURN: target label for jump instructions
5383
"movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5384
"popl %%edx \n\t" // restore edx
5385
"popl %%ecx \n\t" // restore ecx
5386
"popl %%ebx \n\t" // restore ebx
5388
// "ret \n\t" // DONE: no MMX support
5389
// (fall through to standard C "ret")
5391
: // output list (none)
5393
: // any variables used on input (none)
5395
// NOTE(review): the asm writes _mmx_supported and changes the flags, yet
// the "memory" and "cc" clobbers below are commented out and the result is
// not an output operand.  The C code reads _mmx_supported right after this
// statement -- confirm the compiler cannot hoist that read above the asm.
: "%eax" // clobber list
5396
// , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5397
// , "memory" // if write to a variable gcc thought was in a reg
5398
// , "cc" // "condition codes" (flag bits)
5402
#endif /* PNG_MMX_CODE_SUPPORTED */
5404
return _mmx_supported;
5408
#endif /* PNG_USE_PNGGCCRD */