/* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
 *
 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
 * for Intel's performance analysis of the MMX vs. non-MMX code.
 *
 * libpng version 1.0.8 - July 24, 2000
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
 * Interface to libpng contributed by Gilles Vollant, 1999.
 * GNU C port by Greg Roelofs, 1999.
 *
 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
 *
 *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
 *
 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
 *
 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
 * is required to assemble the newer MMX instructions such as movq.
 *
 *   ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
 *
 * (or a later version in the same directory). For Linux, check your
 * distribution's web site(s) or try these links:
 *
 *   http://rufus.w3.org/linux/RPM/binutils.html
 *   http://www.debian.org/Packages/stable/devel/binutils.html
 *   ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
 *
 * For other platforms, see the main GNU site:
 *
 *   ftp://ftp.gnu.org/pub/gnu/binutils/
 *
 * Version 2.5.2l.15 is definitely too old...
 *
 * NOTES (mostly by Greg Roelofs)
 *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
 *  - additional optimizations (possible or definite):
 *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
 *     - write MMX code for 48-bit case (pixel_bytes == 6)
 *     - figure out what's up with 24-bit case (pixel_bytes == 3):
 *        why subtract 8 from width_mmx in the pass 4/5 case?
 *        (only width_mmx case)
 *     x [DONE] replace pixel_bytes within each block with the true
 *        constant value (or are compilers smart enough to do that?)
 *     - rewrite all MMX interlacing code so it's aligned with
 *        the *beginning* of the row buffer, not the end. This
 *        would not only allow one to eliminate half of the memory
 *        writes for odd passes (i.e., pass == odd), it may also
 *        eliminate some unaligned-data-access exceptions (assuming
 *        there's a penalty for not aligning 64-bit accesses on
 *        64-bit boundaries). The only catch is that the "leftover"
 *        pixel(s) at the end of the row would have to be saved,
 *        but there are enough unused MMX registers in every case,
 *        so this is not a problem. A further benefit is that the
 *        post-MMX cleanup code (C code) in at least some of the
 *        cases could be done within the assembler block.
 *     x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
 *        inconsistent, and don't match the MMX Programmer's Reference
 *        Manual conventions anyway. They should be changed to
 *        "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
 *        was lowest in memory (e.g., corresponding to a left pixel)
 *        and b7 is the byte that was highest (e.g., a right pixel).
 *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
 *     want globals prefixed by underscores when referencing them--
 *     i.e., if the variable is const4, then refer to it as const4,
 *     not _const4. This seems to be a djgpp-specific requirement.
 *     Also, such variables apparently *must* be declared outside
 *     of functions; neither static nor automatic variables work if
 *     defined within the scope of a single function, but both
 *     static and truly global (multi-module) variables work fine.
 *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
 *  - switched from string-concatenation-with-macros to cleaner method of
 *     renaming global variables for djgpp--i.e., always use prefixes in
 *     inlined assembler code (== strings) and conditionally rename the
 *     variables, not the other way around. Hence _const4, _mask8_0, etc.
 *  - fixed mmxsupport()/png_do_interlace() first-row bug
 *     This one was severely weird: even though mmxsupport() doesn't touch
 *     ebx (where "row" pointer was stored), it nevertheless managed to zero
 *     the register (even in static/non-fPIC code--see below), which in turn
 *     caused png_do_interlace() to return prematurely on the first row of
 *     interlaced images (i.e., without expanding the interlaced pixels).
 *     Inspection of the generated assembly code didn't turn up any clues,
 *     although it did point at a minor optimization (i.e., get rid of
 *     mmx_supported_local variable and just use eax). Possibly the CPUID
 *     instruction is more destructive than it looks? (Not yet checked.)
 *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
 *     listings... Apparently register spillage has to do with ebx, since
 *     it's used to index the global offset table. Commenting it out of the
 *     input-reg lists in png_combine_row() eliminated compiler barfage, so
 *     ifdef'd with __PIC__ macro: if defined, use a global for unmask
 *  - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
 *     "AuthenticAMD", etc.) placed in EBX:EDX:ECX. Still need to polish.
 *  - made "diff" variable (now "_dif") global to simplify conversion of
 *     filtering routines (running out of regs, sigh). "diff" is still used
 *     in interlacing routines, however.
 *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
 *     macro determines which is used); original not yet tested.
 *  - When compiling with gcc, be sure to use -fomit-frame-pointer
 *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
 *     pass == 4 or 5, that caused visible corruption of interlaced images
 *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
 *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
 *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
 *     Chuck Wilson supplied a patch involving dummy output registers. See
 *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
 *     for the original (anonymous) SourceForge bug report.
 *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
 *       pnggccrd.c: In function `png_combine_row':
 *       pnggccrd.c:525: more than 10 operands in `asm'
 *       pnggccrd.c:669: more than 10 operands in `asm'
 *       pnggccrd.c:828: more than 10 operands in `asm'
 *       pnggccrd.c:994: more than 10 operands in `asm'
 *       pnggccrd.c:1177: more than 10 operands in `asm'
 *     They are all the same problem and can be worked around by using the
 *     global _unmask variable unconditionally, not just in the -fPIC case.
 *     Apparently earlier versions of gcc also have the problem with more than
 *     10 operands; they just don't report it. Much strangeness ensues, etc.
 */
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
159
int mmxsupport(void);
161
static int mmx_supported = 2;
163
#ifdef PNG_USE_LOCAL_ARRAYS
164
static const int png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
165
static const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
166
static const int png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
169
// djgpp, Win32, and Cygwin add their own underscores to global variables,
170
// so define them without:
171
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
172
# define _unmask unmask
173
# define _const4 const4
174
# define _const6 const6
175
# define _mask8_0 mask8_0
176
# define _mask16_1 mask16_1
177
# define _mask16_0 mask16_0
178
# define _mask24_2 mask24_2
179
# define _mask24_1 mask24_1
180
# define _mask24_0 mask24_0
181
# define _mask32_3 mask32_3
182
# define _mask32_2 mask32_2
183
# define _mask32_1 mask32_1
184
# define _mask32_0 mask32_0
185
# define _mask48_5 mask48_5
186
# define _mask48_4 mask48_4
187
# define _mask48_3 mask48_3
188
# define _mask48_2 mask48_2
189
# define _mask48_1 mask48_1
190
# define _mask48_0 mask48_0
191
# define _FullLength FullLength
192
# define _MMXLength MMXLength
196
/* These constants are used in the inlined MMX assembly code.
197
Ignore gcc's "At top level: defined but not used" warnings. */
199
/* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
200
* since that case uses the %ebx register for indexing the Global Offset Table
201
* and there were no other registers available. But gcc 2.95 and later emit
202
* "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
203
* in the non-PIC case, so we'll just use the global unconditionally now.
207
static unsigned long long _mask8_0 = 0x0102040810204080LL;
209
static unsigned long long _mask16_1 = 0x0101020204040808LL;
210
static unsigned long long _mask16_0 = 0x1010202040408080LL;
212
static unsigned long long _mask24_2 = 0x0101010202020404LL;
213
static unsigned long long _mask24_1 = 0x0408080810101020LL;
214
static unsigned long long _mask24_0 = 0x2020404040808080LL;
216
static unsigned long long _mask32_3 = 0x0101010102020202LL;
217
static unsigned long long _mask32_2 = 0x0404040408080808LL;
218
static unsigned long long _mask32_1 = 0x1010101020202020LL;
219
static unsigned long long _mask32_0 = 0x4040404080808080LL;
221
static unsigned long long _mask48_5 = 0x0101010101010202LL;
222
static unsigned long long _mask48_4 = 0x0202020204040404LL;
223
static unsigned long long _mask48_3 = 0x0404080808080808LL;
224
static unsigned long long _mask48_2 = 0x1010101010102020LL;
225
static unsigned long long _mask48_1 = 0x2020202040404040LL;
226
static unsigned long long _mask48_0 = 0x4040808080808080LL;
228
static unsigned long long _const4 = 0x0000000000FFFFFFLL;
229
//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
230
static unsigned long long _const6 = 0x00000000000000FFLL;
232
// These are used in the row-filter routines and should/would be local
233
// variables if not for gcc addressing limitations.
235
static png_uint_32 _FullLength;
236
static png_uint_32 _MMXLength;
241
png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
242
png_bytep row, png_bytep prev_row, int filter);
245
#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
247
/* Combines the row recently read in with the previous row.
248
This routine takes care of alpha and transparency if requested.
249
This routine also handles the two methods of progressive display
250
of interlaced images, depending on the mask value.
251
The mask value describes which pixels are to be combined with
252
the row. The pattern always repeats every 8 pixels, so just 8
253
bits are needed. A one indicates the pixel is to be combined; a
254
zero indicates the pixel is to be skipped. This is in addition
255
to any alpha or transparency value associated with the pixel.
256
If you want all pixels to be combined, pass 0xff (255) in mask. */
258
/* Use this routine for the x86 platform - it uses a faster MMX routine
259
if the machine supports MMX. */
262
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
264
png_debug(1,"in png_combine_row_asm\n");
266
if (mmx_supported == 2)
267
mmx_supported = mmxsupport();
270
fprintf(stderr, "GRR DEBUG: png_combine_row() pixel_depth = %d, mask = 0x%02x, unmask = 0x%02x\n", png_ptr->row_info.pixel_depth, mask, ~mask);
275
png_memcpy(row, png_ptr->row_buf + 1,
276
(png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
278
/* GRR: add "else if (mask == 0)" case?
279
* or does png_combine_row() not even get called in that case? */
282
switch (png_ptr->row_info.pixel_depth)
284
case 1: // png_ptr->row_info.pixel_depth
288
int s_inc, s_start, s_end;
293
sp = png_ptr->row_buf + 1;
296
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
297
if (png_ptr->transformations & PNG_PACKSWAP)
313
for (i = 0; i < png_ptr->width; i++)
319
value = (*sp >> shift) & 0x1;
320
*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
321
*dp |= (png_byte)(value << shift);
341
case 2: // png_ptr->row_info.pixel_depth
345
int s_start, s_end, s_inc;
351
sp = png_ptr->row_buf + 1;
354
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
355
if (png_ptr->transformations & PNG_PACKSWAP)
371
for (i = 0; i < png_ptr->width; i++)
375
value = (*sp >> shift) & 0x3;
376
*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
377
*dp |= (png_byte)(value << shift);
396
case 4: // png_ptr->row_info.pixel_depth
400
int s_start, s_end, s_inc;
406
sp = png_ptr->row_buf + 1;
409
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
410
if (png_ptr->transformations & PNG_PACKSWAP)
425
for (i = 0; i < png_ptr->width; i++)
429
value = (*sp >> shift) & 0xf;
430
*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
431
*dp |= (png_byte)(value << shift);
450
case 8: // png_ptr->row_info.pixel_depth
459
int dummy_value_a; // fix 'forbidden register spilled' error
464
_unmask = ~mask; // global variable for -fPIC version
465
srcptr = png_ptr->row_buf + 1;
467
len = png_ptr->width &~7; // reduce to multiple of 8
468
diff = png_ptr->width & 7; // amount lost
470
__asm__ __volatile__ (
471
"movd _unmask, %%mm7 \n\t" // load bit pattern
472
"psubb %%mm6, %%mm6 \n\t" // zero mm6
473
"punpcklbw %%mm7, %%mm7 \n\t"
474
"punpcklwd %%mm7, %%mm7 \n\t"
475
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
477
"movq _mask8_0, %%mm0 \n\t"
478
"pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
479
"pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
481
// preload "movl len, %%ecx \n\t" // load length of line
482
// preload "movl srcptr, %%esi \n\t" // load source
483
// preload "movl dstptr, %%edi \n\t" // load dest
485
"cmpl $0, %%ecx \n\t" // len == 0 ?
486
"je mainloop8end \n\t"
489
"movq (%%esi), %%mm4 \n\t" // *srcptr
490
"pand %%mm0, %%mm4 \n\t"
491
"movq %%mm0, %%mm6 \n\t"
492
"pandn (%%edi), %%mm6 \n\t" // *dstptr
493
"por %%mm6, %%mm4 \n\t"
494
"movq %%mm4, (%%edi) \n\t"
495
"addl $8, %%esi \n\t" // inc by 8 bytes processed
496
"addl $8, %%edi \n\t"
497
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
501
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
502
"movl %%eax, %%ecx \n\t"
503
"cmpl $0, %%ecx \n\t"
505
// preload "movl mask, %%edx \n\t"
506
"sall $24, %%edx \n\t" // make low byte, high byte
509
"sall %%edx \n\t" // move high bit to CF
510
"jnc skip8 \n\t" // if CF = 0
511
"movb (%%esi), %%al \n\t"
512
"movb %%al, (%%edi) \n\t"
518
"jnz secondloop8 \n\t"
523
: "=a" (dummy_value_a), // output regs (dummy)
524
"=d" (dummy_value_d),
525
"=c" (dummy_value_c),
526
"=S" (dummy_value_S),
529
: "3" (srcptr), // esi // input regs
532
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
537
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
538
: "%mm0", "%mm4", "%mm6", "%mm7"
542
else /* mmx _not supported - Use modified C routine */
544
register png_uint_32 i;
545
png_uint_32 initial_val = png_pass_start[png_ptr->pass];
546
// png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
547
register int stride = png_pass_inc[png_ptr->pass];
548
// png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
549
register int rep_bytes = png_pass_width[png_ptr->pass];
550
// png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
551
register png_uint_32 final_val = png_ptr->width;
553
srcptr = png_ptr->row_buf + 1 + initial_val;
554
dstptr = row + initial_val;
556
for (i = initial_val; i < final_val; i += stride)
558
png_memcpy(dstptr, srcptr, rep_bytes);
567
case 16: // png_ptr->row_info.pixel_depth
576
int dummy_value_a; // fix 'forbidden register spilled' error
581
_unmask = ~mask; // global variable for -fPIC version
582
srcptr = png_ptr->row_buf + 1;
584
len = png_ptr->width &~7; // reduce to multiple of 8
585
diff = png_ptr->width & 7; // amount lost
587
__asm__ __volatile__ (
588
"movd _unmask, %%mm7 \n\t" // load bit pattern
589
"psubb %%mm6, %%mm6 \n\t" // zero mm6
590
"punpcklbw %%mm7, %%mm7 \n\t"
591
"punpcklwd %%mm7, %%mm7 \n\t"
592
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
594
"movq _mask16_0, %%mm0 \n\t"
595
"movq _mask16_1, %%mm1 \n\t"
597
"pand %%mm7, %%mm0 \n\t"
598
"pand %%mm7, %%mm1 \n\t"
600
"pcmpeqb %%mm6, %%mm0 \n\t"
601
"pcmpeqb %%mm6, %%mm1 \n\t"
603
// preload "movl len, %%ecx \n\t" // load length of line
604
// preload "movl srcptr, %%esi \n\t" // load source
605
// preload "movl dstptr, %%edi \n\t" // load dest
607
"cmpl $0, %%ecx \n\t"
608
"jz mainloop16end \n\t"
611
"movq (%%esi), %%mm4 \n\t"
612
"pand %%mm0, %%mm4 \n\t"
613
"movq %%mm0, %%mm6 \n\t"
614
"movq (%%edi), %%mm7 \n\t"
615
"pandn %%mm7, %%mm6 \n\t"
616
"por %%mm6, %%mm4 \n\t"
617
"movq %%mm4, (%%edi) \n\t"
619
"movq 8(%%esi), %%mm5 \n\t"
620
"pand %%mm1, %%mm5 \n\t"
621
"movq %%mm1, %%mm7 \n\t"
622
"movq 8(%%edi), %%mm6 \n\t"
623
"pandn %%mm6, %%mm7 \n\t"
624
"por %%mm7, %%mm5 \n\t"
625
"movq %%mm5, 8(%%edi) \n\t"
627
"addl $16, %%esi \n\t" // inc by 16 bytes processed
628
"addl $16, %%edi \n\t"
629
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
632
"mainloop16end: \n\t"
633
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
634
"movl %%eax, %%ecx \n\t"
635
"cmpl $0, %%ecx \n\t"
637
// preload "movl mask, %%edx \n\t"
638
"sall $24, %%edx \n\t" // make low byte, high byte
641
"sall %%edx \n\t" // move high bit to CF
642
"jnc skip16 \n\t" // if CF = 0
643
"movw (%%esi), %%ax \n\t"
644
"movw %%ax, (%%edi) \n\t"
647
"addl $2, %%esi \n\t"
648
"addl $2, %%edi \n\t"
650
"jnz secondloop16 \n\t"
655
: "=a" (dummy_value_a), // output regs (dummy)
656
"=d" (dummy_value_d),
657
"=c" (dummy_value_c),
658
"=S" (dummy_value_S),
661
: "3" (srcptr), // esi // input regs
664
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
669
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
671
"%mm4", "%mm5", "%mm6", "%mm7"
675
else /* mmx _not supported - Use modified C routine */
677
register png_uint_32 i;
678
png_uint_32 initial_val = 2 * png_pass_start[png_ptr->pass];
679
// png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
680
register int stride = 2 * png_pass_inc[png_ptr->pass];
681
// png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
682
register int rep_bytes = 2 * png_pass_width[png_ptr->pass];
683
// png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
684
register png_uint_32 final_val = 2 * png_ptr->width;
686
srcptr = png_ptr->row_buf + 1 + initial_val;
687
dstptr = row + initial_val;
689
for (i = initial_val; i < final_val; i += stride)
691
png_memcpy(dstptr, srcptr, rep_bytes);
700
case 24: // png_ptr->row_info.pixel_depth
709
int dummy_value_a; // fix 'forbidden register spilled' error
714
_unmask = ~mask; // global variable for -fPIC version
715
srcptr = png_ptr->row_buf + 1;
717
len = png_ptr->width &~7; // reduce to multiple of 8
718
diff = png_ptr->width & 7; // amount lost
720
__asm__ __volatile__ (
721
"movd _unmask, %%mm7 \n\t" // load bit pattern
722
"psubb %%mm6, %%mm6 \n\t" // zero mm6
723
"punpcklbw %%mm7, %%mm7 \n\t"
724
"punpcklwd %%mm7, %%mm7 \n\t"
725
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
727
"movq _mask24_0, %%mm0 \n\t"
728
"movq _mask24_1, %%mm1 \n\t"
729
"movq _mask24_2, %%mm2 \n\t"
731
"pand %%mm7, %%mm0 \n\t"
732
"pand %%mm7, %%mm1 \n\t"
733
"pand %%mm7, %%mm2 \n\t"
735
"pcmpeqb %%mm6, %%mm0 \n\t"
736
"pcmpeqb %%mm6, %%mm1 \n\t"
737
"pcmpeqb %%mm6, %%mm2 \n\t"
739
// preload "movl len, %%ecx \n\t" // load length of line
740
// preload "movl srcptr, %%esi \n\t" // load source
741
// preload "movl dstptr, %%edi \n\t" // load dest
743
"cmpl $0, %%ecx \n\t"
744
"jz mainloop24end \n\t"
747
"movq (%%esi), %%mm4 \n\t"
748
"pand %%mm0, %%mm4 \n\t"
749
"movq %%mm0, %%mm6 \n\t"
750
"movq (%%edi), %%mm7 \n\t"
751
"pandn %%mm7, %%mm6 \n\t"
752
"por %%mm6, %%mm4 \n\t"
753
"movq %%mm4, (%%edi) \n\t"
755
"movq 8(%%esi), %%mm5 \n\t"
756
"pand %%mm1, %%mm5 \n\t"
757
"movq %%mm1, %%mm7 \n\t"
758
"movq 8(%%edi), %%mm6 \n\t"
759
"pandn %%mm6, %%mm7 \n\t"
760
"por %%mm7, %%mm5 \n\t"
761
"movq %%mm5, 8(%%edi) \n\t"
763
"movq 16(%%esi), %%mm6 \n\t"
764
"pand %%mm2, %%mm6 \n\t"
765
"movq %%mm2, %%mm4 \n\t"
766
"movq 16(%%edi), %%mm7 \n\t"
767
"pandn %%mm7, %%mm4 \n\t"
768
"por %%mm4, %%mm6 \n\t"
769
"movq %%mm6, 16(%%edi) \n\t"
771
"addl $24, %%esi \n\t" // inc by 24 bytes processed
772
"addl $24, %%edi \n\t"
773
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
777
"mainloop24end: \n\t"
778
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
779
"movl %%eax, %%ecx \n\t"
780
"cmpl $0, %%ecx \n\t"
782
// preload "movl mask, %%edx \n\t"
783
"sall $24, %%edx \n\t" // make low byte, high byte
786
"sall %%edx \n\t" // move high bit to CF
787
"jnc skip24 \n\t" // if CF = 0
788
"movw (%%esi), %%ax \n\t"
789
"movw %%ax, (%%edi) \n\t"
790
"xorl %%eax, %%eax \n\t"
791
"movb 2(%%esi), %%al \n\t"
792
"movb %%al, 2(%%edi) \n\t"
795
"addl $3, %%esi \n\t"
796
"addl $3, %%edi \n\t"
798
"jnz secondloop24 \n\t"
803
: "=a" (dummy_value_a), // output regs (dummy)
804
"=d" (dummy_value_d),
805
"=c" (dummy_value_c),
806
"=S" (dummy_value_S),
809
: "3" (srcptr), // esi // input regs
812
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
817
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
818
: "%mm0", "%mm1", "%mm2",
819
"%mm4", "%mm5", "%mm6", "%mm7"
823
else /* mmx _not supported - Use modified C routine */
825
register png_uint_32 i;
826
png_uint_32 initial_val = 3 * png_pass_start[png_ptr->pass];
827
// png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
828
register int stride = 3 * png_pass_inc[png_ptr->pass];
829
// png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
830
register int rep_bytes = 3 * png_pass_width[png_ptr->pass];
831
// png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
832
register png_uint_32 final_val = 3 * png_ptr->width;
834
srcptr = png_ptr->row_buf + 1 + initial_val;
835
dstptr = row + initial_val;
837
for (i = initial_val; i < final_val; i += stride)
839
png_memcpy(dstptr, srcptr, rep_bytes);
848
case 32: // png_ptr->row_info.pixel_depth
857
int dummy_value_a; // fix 'forbidden register spilled' error
862
_unmask = ~mask; // global variable for -fPIC version
863
srcptr = png_ptr->row_buf + 1;
865
len = png_ptr->width &~7; // reduce to multiple of 8
866
diff = png_ptr->width & 7; // amount lost
868
__asm__ __volatile__ (
869
"movd _unmask, %%mm7 \n\t" // load bit pattern
870
"psubb %%mm6, %%mm6 \n\t" // zero mm6
871
"punpcklbw %%mm7, %%mm7 \n\t"
872
"punpcklwd %%mm7, %%mm7 \n\t"
873
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
875
"movq _mask32_0, %%mm0 \n\t"
876
"movq _mask32_1, %%mm1 \n\t"
877
"movq _mask32_2, %%mm2 \n\t"
878
"movq _mask32_3, %%mm3 \n\t"
880
"pand %%mm7, %%mm0 \n\t"
881
"pand %%mm7, %%mm1 \n\t"
882
"pand %%mm7, %%mm2 \n\t"
883
"pand %%mm7, %%mm3 \n\t"
885
"pcmpeqb %%mm6, %%mm0 \n\t"
886
"pcmpeqb %%mm6, %%mm1 \n\t"
887
"pcmpeqb %%mm6, %%mm2 \n\t"
888
"pcmpeqb %%mm6, %%mm3 \n\t"
890
// preload "movl len, %%ecx \n\t" // load length of line
891
// preload "movl srcptr, %%esi \n\t" // load source
892
// preload "movl dstptr, %%edi \n\t" // load dest
894
"cmpl $0, %%ecx \n\t" // lcr
895
"jz mainloop32end \n\t"
898
"movq (%%esi), %%mm4 \n\t"
899
"pand %%mm0, %%mm4 \n\t"
900
"movq %%mm0, %%mm6 \n\t"
901
"movq (%%edi), %%mm7 \n\t"
902
"pandn %%mm7, %%mm6 \n\t"
903
"por %%mm6, %%mm4 \n\t"
904
"movq %%mm4, (%%edi) \n\t"
906
"movq 8(%%esi), %%mm5 \n\t"
907
"pand %%mm1, %%mm5 \n\t"
908
"movq %%mm1, %%mm7 \n\t"
909
"movq 8(%%edi), %%mm6 \n\t"
910
"pandn %%mm6, %%mm7 \n\t"
911
"por %%mm7, %%mm5 \n\t"
912
"movq %%mm5, 8(%%edi) \n\t"
914
"movq 16(%%esi), %%mm6 \n\t"
915
"pand %%mm2, %%mm6 \n\t"
916
"movq %%mm2, %%mm4 \n\t"
917
"movq 16(%%edi), %%mm7 \n\t"
918
"pandn %%mm7, %%mm4 \n\t"
919
"por %%mm4, %%mm6 \n\t"
920
"movq %%mm6, 16(%%edi) \n\t"
922
"movq 24(%%esi), %%mm7 \n\t"
923
"pand %%mm3, %%mm7 \n\t"
924
"movq %%mm3, %%mm5 \n\t"
925
"movq 24(%%edi), %%mm4 \n\t"
926
"pandn %%mm4, %%mm5 \n\t"
927
"por %%mm5, %%mm7 \n\t"
928
"movq %%mm7, 24(%%edi) \n\t"
930
"addl $32, %%esi \n\t" // inc by 32 bytes processed
931
"addl $32, %%edi \n\t"
932
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
935
"mainloop32end: \n\t"
936
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
937
"movl %%eax, %%ecx \n\t"
938
"cmpl $0, %%ecx \n\t"
940
// preload "movl mask, %%edx \n\t"
941
"sall $24, %%edx \n\t" // low byte => high byte
944
"sall %%edx \n\t" // move high bit to CF
945
"jnc skip32 \n\t" // if CF = 0
946
"movl (%%esi), %%eax \n\t"
947
"movl %%eax, (%%edi) \n\t"
950
"addl $4, %%esi \n\t"
951
"addl $4, %%edi \n\t"
953
"jnz secondloop32 \n\t"
958
: "=a" (dummy_value_a), // output regs (dummy)
959
"=d" (dummy_value_d),
960
"=c" (dummy_value_c),
961
"=S" (dummy_value_S),
964
: "3" (srcptr), // esi // input regs
967
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
972
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
973
: "%mm0", "%mm1", "%mm2", "%mm3",
974
"%mm4", "%mm5", "%mm6", "%mm7"
978
else /* mmx _not supported - Use modified C routine */
980
register png_uint_32 i;
981
png_uint_32 initial_val = 4 * png_pass_start[png_ptr->pass];
982
// png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
983
register int stride = 4 * png_pass_inc[png_ptr->pass];
984
// png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
985
register int rep_bytes = 4 * png_pass_width[png_ptr->pass];
986
// png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
987
register png_uint_32 final_val = 4 * png_ptr->width;
989
srcptr = png_ptr->row_buf + 1 + initial_val;
990
dstptr = row + initial_val;
992
for (i = initial_val; i < final_val; i += stride)
994
png_memcpy(dstptr, srcptr, rep_bytes);
1003
case 48: // png_ptr->row_info.pixel_depth
1012
int dummy_value_a; // fix 'forbidden register spilled' error
1017
_unmask = ~mask; // global variable for -fPIC version
1018
srcptr = png_ptr->row_buf + 1;
1020
len = png_ptr->width &~7; // reduce to multiple of 8
1021
diff = png_ptr->width & 7; // amount lost
1023
__asm__ __volatile__ (
1024
"movd _unmask, %%mm7 \n\t" // load bit pattern
1025
"psubb %%mm6, %%mm6 \n\t" // zero mm6
1026
"punpcklbw %%mm7, %%mm7 \n\t"
1027
"punpcklwd %%mm7, %%mm7 \n\t"
1028
"punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1030
"movq _mask48_0, %%mm0 \n\t"
1031
"movq _mask48_1, %%mm1 \n\t"
1032
"movq _mask48_2, %%mm2 \n\t"
1033
"movq _mask48_3, %%mm3 \n\t"
1034
"movq _mask48_4, %%mm4 \n\t"
1035
"movq _mask48_5, %%mm5 \n\t"
1037
"pand %%mm7, %%mm0 \n\t"
1038
"pand %%mm7, %%mm1 \n\t"
1039
"pand %%mm7, %%mm2 \n\t"
1040
"pand %%mm7, %%mm3 \n\t"
1041
"pand %%mm7, %%mm4 \n\t"
1042
"pand %%mm7, %%mm5 \n\t"
1044
"pcmpeqb %%mm6, %%mm0 \n\t"
1045
"pcmpeqb %%mm6, %%mm1 \n\t"
1046
"pcmpeqb %%mm6, %%mm2 \n\t"
1047
"pcmpeqb %%mm6, %%mm3 \n\t"
1048
"pcmpeqb %%mm6, %%mm4 \n\t"
1049
"pcmpeqb %%mm6, %%mm5 \n\t"
1051
// preload "movl len, %%ecx \n\t" // load length of line
1052
// preload "movl srcptr, %%esi \n\t" // load source
1053
// preload "movl dstptr, %%edi \n\t" // load dest
1055
"cmpl $0, %%ecx \n\t"
1056
"jz mainloop48end \n\t"
1059
"movq (%%esi), %%mm7 \n\t"
1060
"pand %%mm0, %%mm7 \n\t"
1061
"movq %%mm0, %%mm6 \n\t"
1062
"pandn (%%edi), %%mm6 \n\t"
1063
"por %%mm6, %%mm7 \n\t"
1064
"movq %%mm7, (%%edi) \n\t"
1066
"movq 8(%%esi), %%mm6 \n\t"
1067
"pand %%mm1, %%mm6 \n\t"
1068
"movq %%mm1, %%mm7 \n\t"
1069
"pandn 8(%%edi), %%mm7 \n\t"
1070
"por %%mm7, %%mm6 \n\t"
1071
"movq %%mm6, 8(%%edi) \n\t"
1073
"movq 16(%%esi), %%mm6 \n\t"
1074
"pand %%mm2, %%mm6 \n\t"
1075
"movq %%mm2, %%mm7 \n\t"
1076
"pandn 16(%%edi), %%mm7 \n\t"
1077
"por %%mm7, %%mm6 \n\t"
1078
"movq %%mm6, 16(%%edi) \n\t"
1080
"movq 24(%%esi), %%mm7 \n\t"
1081
"pand %%mm3, %%mm7 \n\t"
1082
"movq %%mm3, %%mm6 \n\t"
1083
"pandn 24(%%edi), %%mm6 \n\t"
1084
"por %%mm6, %%mm7 \n\t"
1085
"movq %%mm7, 24(%%edi) \n\t"
1087
"movq 32(%%esi), %%mm6 \n\t"
1088
"pand %%mm4, %%mm6 \n\t"
1089
"movq %%mm4, %%mm7 \n\t"
1090
"pandn 32(%%edi), %%mm7 \n\t"
1091
"por %%mm7, %%mm6 \n\t"
1092
"movq %%mm6, 32(%%edi) \n\t"
1094
"movq 40(%%esi), %%mm7 \n\t"
1095
"pand %%mm5, %%mm7 \n\t"
1096
"movq %%mm5, %%mm6 \n\t"
1097
"pandn 40(%%edi), %%mm6 \n\t"
1098
"por %%mm6, %%mm7 \n\t"
1099
"movq %%mm7, 40(%%edi) \n\t"
1101
"addl $48, %%esi \n\t" // inc by 48 bytes processed
1102
"addl $48, %%edi \n\t"
1103
"subl $8, %%ecx \n\t" // dec by 8 pixels processed
1105
"ja mainloop48 \n\t"
1107
"mainloop48end: \n\t"
1108
// preload "movl diff, %%ecx \n\t" // (diff is in eax)
1109
"movl %%eax, %%ecx \n\t"
1110
"cmpl $0, %%ecx \n\t"
1112
// preload "movl mask, %%edx \n\t"
1113
"sall $24, %%edx \n\t" // make low byte, high byte
1115
"secondloop48: \n\t"
1116
"sall %%edx \n\t" // move high bit to CF
1117
"jnc skip48 \n\t" // if CF = 0
1118
"movl (%%esi), %%eax \n\t"
1119
"movl %%eax, (%%edi) \n\t"
1122
"addl $4, %%esi \n\t"
1123
"addl $4, %%edi \n\t"
1125
"jnz secondloop48 \n\t"
1130
: "=a" (dummy_value_a), // output regs (dummy)
1131
"=d" (dummy_value_d),
1132
"=c" (dummy_value_c),
1133
"=S" (dummy_value_S),
1134
"=D" (dummy_value_D)
1136
: "3" (srcptr), // esi // input regs
1137
"4" (dstptr), // edi
1139
// was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1143
// : // clobber list
1144
#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1145
: "%mm0", "%mm1", "%mm2", "%mm3",
1146
"%mm4", "%mm5", "%mm6", "%mm7"
1150
else /* mmx _not supported - Use modified C routine */
1152
register png_uint_32 i;
1153
png_uint_32 initial_val = 6 * png_pass_start[png_ptr->pass];
1154
// png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1155
register int stride = 6 * png_pass_inc[png_ptr->pass];
1156
// png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1157
register int rep_bytes = 6 * png_pass_width[png_ptr->pass];
1158
// png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1159
register png_uint_32 final_val = 6 * png_ptr->width;
1161
srcptr = png_ptr->row_buf + 1 + initial_val;
1162
dstptr = row + initial_val;
1164
for (i = initial_val; i < final_val; i += stride)
1166
png_memcpy(dstptr, srcptr, rep_bytes);
1175
case 64: // png_ptr->row_info.pixel_depth
1179
register png_uint_32 i;
1180
png_uint_32 initial_val = 8 * png_pass_start[png_ptr->pass];
1181
// png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1182
register int stride = 8 * png_pass_inc[png_ptr->pass];
1183
// png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1184
register int rep_bytes = 8 * png_pass_width[png_ptr->pass];
1185
// png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1186
register png_uint_32 final_val = 8 * png_ptr->width;
1188
srcptr = png_ptr->row_buf + 1 + initial_val;
1189
dstptr = row + initial_val;
1191
for (i = initial_val; i < final_val; i += stride)
1193
png_memcpy(dstptr, srcptr, rep_bytes);
1200
default: // png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64
1202
// this should never happen
1204
"libpng internal error: png_ptr->row_info.pixel_depth = %d\n",
1205
png_ptr->row_info.pixel_depth);
1209
} /* end switch (png_ptr->row_info.pixel_depth) */
1211
} /* end if (non-trivial mask) */
1213
} /* end png_combine_row() */
1215
#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1219
#if defined(PNG_READ_INTERLACING_SUPPORTED)
1220
#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1222
/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1223
* has taken place. [GRR: what other steps come before and/or after?]
1227
png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
1228
png_uint_32 transformations)
1231
fprintf(stderr, "GRR DEBUG: entering png_do_read_interlace()\n");
1232
if (row == NULL) fprintf(stderr, "GRR DEBUG: row == NULL\n");
1233
if (row_info == NULL) fprintf(stderr, "GRR DEBUG: row_info == NULL\n");
1236
png_debug(1,"in png_do_read_interlace\n");
1238
if (mmx_supported == 2)
1239
mmx_supported = mmxsupport();
1242
fprintf(stderr, "GRR DEBUG: calling mmxsupport()\n");
1243
fprintf(stderr, "GRR DEBUG: done with mmxsupport() (mmx_supported = %d)\n", mmx_supported);
1248
this one happened on first row due to weirdness with mmxsupport():
1249
if (row == NULL) fprintf(stderr, "GRR DEBUG: now row == NULL!!!\n");
1250
row was in ebx, and even though nothing touched ebx, it still got wiped...
1251
[weird side effect of CPUID instruction?]
1252
if (row_info == NULL) fprintf(stderr, "GRR DEBUG: now row_info == NULL!!!\n");
1254
if (row != NULL && row_info != NULL)
1256
png_uint_32 final_width;
1258
final_width = row_info->width * png_pass_inc[pass];
1261
fprintf(stderr, "GRR DEBUG: png_do_read_interlace() row_info->width = %d, final_width = %d\n", row_info->width, final_width);
1262
fprintf(stderr, "GRR DEBUG: png_do_read_interlace() pixel_depth = %d\n", row_info->pixel_depth);
1265
switch (row_info->pixel_depth)
1271
int s_start, s_end, s_inc;
1276
sp = row + (png_size_t)((row_info->width - 1) >> 3);
1277
dp = row + (png_size_t)((final_width - 1) >> 3);
1278
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1279
if (transformations & PNG_PACKSWAP)
1281
sshift = (int)((row_info->width + 7) & 7);
1282
dshift = (int)((final_width + 7) & 7);
1290
sshift = 7 - (int)((row_info->width + 7) & 7);
1291
dshift = 7 - (int)((final_width + 7) & 7);
1297
for (i = row_info->width; i; i--)
1299
v = (png_byte)((*sp >> sshift) & 0x1);
1300
for (j = 0; j < png_pass_inc[pass]; j++)
1302
*dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1303
*dp |= (png_byte)(v << dshift);
1304
if (dshift == s_end)
1312
if (sshift == s_end)
1327
int s_start, s_end, s_inc;
1330
sp = row + (png_size_t)((row_info->width - 1) >> 2);
1331
dp = row + (png_size_t)((final_width - 1) >> 2);
1332
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1333
if (transformations & PNG_PACKSWAP)
1335
sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1336
dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1344
sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1345
dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1351
for (i = row_info->width; i; i--)
1356
v = (png_byte)((*sp >> sshift) & 0x3);
1357
for (j = 0; j < png_pass_inc[pass]; j++)
1359
*dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1360
*dp |= (png_byte)(v << dshift);
1361
if (dshift == s_end)
1369
if (sshift == s_end)
1384
int s_start, s_end, s_inc;
1387
sp = row + (png_size_t)((row_info->width - 1) >> 1);
1388
dp = row + (png_size_t)((final_width - 1) >> 1);
1389
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1390
if (transformations & PNG_PACKSWAP)
1392
sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1393
dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1401
sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1402
dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1408
for (i = row_info->width; i; i--)
1413
v = (png_byte)((*sp >> sshift) & 0xf);
1414
for (j = 0; j < png_pass_inc[pass]; j++)
1416
*dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1417
*dp |= (png_byte)(v << dshift);
1418
if (dshift == s_end)
1426
if (sshift == s_end)
1437
//====================================================================
1439
default: // 8-bit or larger (this is where the routine is modified)
1441
// static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1442
// static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1443
// unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1444
// unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1447
png_size_t pixel_bytes;
1448
int width = row_info->width;
1450
pixel_bytes = (row_info->pixel_depth >> 3);
1452
// point sptr at the last pixel in the pre-expanded row:
1453
sptr = row + (width - 1) * pixel_bytes;
1455
// point dp at the last pixel position in the expanded row:
1456
dp = row + (final_width - 1) * pixel_bytes;
1458
// New code by Nirav Chhatrapati - Intel Corporation
1460
if (mmx_supported) // use MMX code if machine supports it
1462
//--------------------------------------------------------------
1463
if (pixel_bytes == 3)
1465
if (((pass == 0) || (pass == 1)) && width)
1467
int dummy_value_c; // fix 'forbidden register spilled'
1470
__asm__ __volatile__ (
1471
"subl $21, %%edi \n\t"
1472
// (png_pass_inc[pass] - 1)*pixel_bytes
1474
".loop3_pass0: \n\t"
1475
"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1476
"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1477
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1478
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1479
"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1480
"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1481
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1482
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1483
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1484
"movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1485
"psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1486
"movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1487
"punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1488
"movq %%mm4, 16(%%edi) \n\t"
1489
"psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1490
"movq %%mm3, 8(%%edi) \n\t"
1491
"punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1492
"subl $3, %%esi \n\t"
1493
"movq %%mm0, (%%edi) \n\t"
1494
"subl $24, %%edi \n\t"
1496
"jnz .loop3_pass0 \n\t"
1499
: "=c" (dummy_value_c), // output regs (dummy)
1500
"=S" (dummy_value_S),
1501
"=D" (dummy_value_D)
1503
: "1" (sptr), // esi // input regs
1506
// doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
1508
// : // clobber list
1509
#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1510
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4"
1514
else if (((pass == 2) || (pass == 3)) && width)
1516
int dummy_value_c; // fix 'forbidden register spilled'
1519
__asm__ __volatile__ (
1520
"subl $9, %%edi \n\t"
1521
// (png_pass_inc[pass] - 1)*pixel_bytes
1523
".loop3_pass2: \n\t"
1524
"movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1525
"pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1526
"movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1527
"psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1528
"movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1529
"psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1530
"psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1531
"por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1532
"por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1533
"movq %%mm0, 4(%%edi) \n\t"
1534
"psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1535
"subl $3, %%esi \n\t"
1536
"movd %%mm0, (%%edi) \n\t"
1537
"subl $12, %%edi \n\t"
1539
"jnz .loop3_pass2 \n\t"
1542
: "=c" (dummy_value_c), // output regs (dummy)
1543
"=S" (dummy_value_S),
1544
"=D" (dummy_value_D)
1546
: "1" (sptr), // esi // input regs
1550
// : // clobber list
1551
#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1552
: "%mm0", "%mm1", "%mm2"
1556
else if (width) /* && ((pass == 4) || (pass == 5)) */
1558
int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1561
width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1564
// png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1565
// sptr points at last pixel in pre-expanded row
1566
// dp points at last pixel position in expanded row
1567
int dummy_value_c; // fix 'forbidden register spilled'
1570
__asm__ __volatile__ (
1571
"subl $3, %%esi \n\t"
1572
"subl $9, %%edi \n\t"
1573
// (png_pass_inc[pass] + 1)*pixel_bytes
1575
".loop3_pass4: \n\t"
1576
"movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1577
"movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1578
"movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1579
"psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1580
"pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1581
"psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1582
"por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1583
"movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1584
"psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1585
"movq %%mm0, (%%edi) \n\t"
1586
"psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1587
"pand _const6, %%mm3 \n\t" // z z z z z z z 5
1588
"por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1589
"subl $6, %%esi \n\t"
1590
"movd %%mm2, 8(%%edi) \n\t"
1591
"subl $12, %%edi \n\t"
1592
"subl $2, %%ecx \n\t"
1593
"jnz .loop3_pass4 \n\t"
1596
: "=c" (dummy_value_c), // output regs (dummy)
1597
"=S" (dummy_value_S),
1598
"=D" (dummy_value_D)
1600
: "1" (sptr), // esi // input regs
1602
"0" (width_mmx) // ecx
1604
// : // clobber list
1605
#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1606
: "%mm0", "%mm1", "%mm2", "%mm3"
1611
sptr -= width_mmx*3;
1613
for (i = width; i; i--)
1618
png_memcpy(v, sptr, 3);
1619
for (j = 0; j < png_pass_inc[pass]; j++)
1621
png_memcpy(dp, v, 3);
1627
} /* end of pixel_bytes == 3 */
1629
//--------------------------------------------------------------
1630
else if (pixel_bytes == 1)
1632
if (((pass == 0) || (pass == 1)) && width)
1634
int width_mmx = ((width >> 2) << 2);
1635
width -= width_mmx; // 0-3 pixels => 0-3 bytes
1638
int dummy_value_c; // fix 'forbidden register spilled'
1641
__asm__ __volatile__ (
1642
"subl $3, %%esi \n\t"
1643
"subl $31, %%edi \n\t"
1645
".loop1_pass0: \n\t"
1646
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1647
"movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1648
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1649
"movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1650
"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1651
"movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1652
"punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1653
"punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1654
"movq %%mm0, (%%edi) \n\t"
1655
"punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1656
"movq %%mm3, 8(%%edi) \n\t"
1657
"movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1658
"punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1659
"punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1660
"movq %%mm2, 16(%%edi) \n\t"
1661
"subl $4, %%esi \n\t"
1662
"movq %%mm4, 24(%%edi) \n\t"
1663
"subl $32, %%edi \n\t"
1664
"subl $4, %%ecx \n\t"
1665
"jnz .loop1_pass0 \n\t"
1668
: "=c" (dummy_value_c), // output regs (dummy)
1669
"=S" (dummy_value_S),
1670
"=D" (dummy_value_D)
1672
: "1" (sptr), // esi // input regs
1674
"0" (width_mmx) // ecx
1676
// : // clobber list
1677
#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1678
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4"
1685
for (i = width; i; i--)
1689
/* I simplified this part in version 1.0.4e
1690
* here and in several other instances where
1691
* pixel_bytes == 1 -- GR-P
1696
* png_memcpy(v, sptr, pixel_bytes);
1697
* for (j = 0; j < png_pass_inc[pass]; j++)
1699
* png_memcpy(dp, v, pixel_bytes);
1700
* dp -= pixel_bytes;
1702
* sptr -= pixel_bytes;
1704
* Replacement code is in the next three lines:
1707
for (j = 0; j < png_pass_inc[pass]; j++)
1712
else if (((pass == 2) || (pass == 3)) && width)
1714
int width_mmx = ((width >> 2) << 2);
1715
width -= width_mmx; // 0-3 pixels => 0-3 bytes
1718
int dummy_value_c; // fix 'forbidden register spilled'
1721
__asm__ __volatile__ (
1722
"subl $3, %%esi \n\t"
1723
"subl $15, %%edi \n\t"
1725
".loop1_pass2: \n\t"
1726
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1727
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1728
"movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
1729
"punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1730
"punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
1731
"movq %%mm0, (%%edi) \n\t"
1732
"subl $4, %%esi \n\t"
1733
"movq %%mm1, 8(%%edi) \n\t"
1734
"subl $16, %%edi \n\t"
1735
"subl $4, %%ecx \n\t"
1736
"jnz .loop1_pass2 \n\t"
1739
: "=c" (dummy_value_c), // output regs (dummy)
1740
"=S" (dummy_value_S),
1741
"=D" (dummy_value_D)
1743
: "1" (sptr), // esi // input regs
1745
"0" (width_mmx) // ecx
1747
// : // clobber list
1748
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1756
for (i = width; i; i--)
1760
for (j = 0; j < png_pass_inc[pass]; j++)
1765
else if (width) /* && ((pass == 4) || (pass == 5)) */
1767
int width_mmx = ((width >> 3) << 3);
1768
width -= width_mmx; // 0-3 pixels => 0-3 bytes
1771
int dummy_value_c; // fix 'forbidden register spilled'
1774
__asm__ __volatile__ (
1775
"subl $7, %%esi \n\t"
1776
"subl $15, %%edi \n\t"
1778
".loop1_pass4: \n\t"
1779
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
1780
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
1781
"punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1782
"punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
1783
"movq %%mm1, 8(%%edi) \n\t"
1784
"subl $8, %%esi \n\t"
1785
"movq %%mm0, (%%edi) \n\t"
1786
"subl $16, %%edi \n\t"
1787
"subl $8, %%ecx \n\t"
1788
"jnz .loop1_pass4 \n\t"
1791
: "=c" (dummy_value_c), // output regs (none)
1792
"=S" (dummy_value_S),
1793
"=D" (dummy_value_D)
1795
: "1" (sptr), // esi // input regs
1797
"0" (width_mmx) // ecx
1799
// : // clobber list
1800
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1808
for (i = width; i; i--)
1812
for (j = 0; j < png_pass_inc[pass]; j++)
1817
} /* end of pixel_bytes == 1 */
1819
//--------------------------------------------------------------
1820
else if (pixel_bytes == 2)
1822
if (((pass == 0) || (pass == 1)) && width)
1824
int width_mmx = ((width >> 1) << 1);
1825
width -= width_mmx; // 0,1 pixels => 0,2 bytes
1828
int dummy_value_c; // fix 'forbidden register spilled'
1831
__asm__ __volatile__ (
1832
"subl $2, %%esi \n\t"
1833
"subl $30, %%edi \n\t"
1835
".loop2_pass0: \n\t"
1836
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1837
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1838
"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
1839
"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
1840
"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
1841
"movq %%mm0, (%%edi) \n\t"
1842
"movq %%mm0, 8(%%edi) \n\t"
1843
"movq %%mm1, 16(%%edi) \n\t"
1844
"subl $4, %%esi \n\t"
1845
"movq %%mm1, 24(%%edi) \n\t"
1846
"subl $32, %%edi \n\t"
1847
"subl $2, %%ecx \n\t"
1848
"jnz .loop2_pass0 \n\t"
1851
: "=c" (dummy_value_c), // output regs (dummy)
1852
"=S" (dummy_value_S),
1853
"=D" (dummy_value_D)
1855
: "1" (sptr), // esi // input regs
1857
"0" (width_mmx) // ecx
1859
// : // clobber list
1860
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1866
sptr -= (width_mmx*2 - 2); // sign fixed
1867
dp -= (width_mmx*16 - 2); // sign fixed
1868
for (i = width; i; i--)
1873
png_memcpy(v, sptr, 2);
1874
for (j = 0; j < png_pass_inc[pass]; j++)
1877
png_memcpy(dp, v, 2);
1881
else if (((pass == 2) || (pass == 3)) && width)
1883
int width_mmx = ((width >> 1) << 1) ;
1884
width -= width_mmx; // 0,1 pixels => 0,2 bytes
1887
int dummy_value_c; // fix 'forbidden register spilled'
1890
__asm__ __volatile__ (
1891
"subl $2, %%esi \n\t"
1892
"subl $14, %%edi \n\t"
1894
".loop2_pass2: \n\t"
1895
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1896
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1897
"movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
1898
"punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
1899
"punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
1900
"movq %%mm0, (%%edi) \n\t"
1901
"subl $4, %%esi \n\t"
1902
"movq %%mm1, 8(%%edi) \n\t"
1903
"subl $16, %%edi \n\t"
1904
"subl $2, %%ecx \n\t"
1905
"jnz .loop2_pass2 \n\t"
1908
: "=c" (dummy_value_c), // output regs (dummy)
1909
"=S" (dummy_value_S),
1910
"=D" (dummy_value_D)
1912
: "1" (sptr), // esi // input regs
1914
"0" (width_mmx) // ecx
1916
// : // clobber list
1917
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1923
sptr -= (width_mmx*2 - 2); // sign fixed
1924
dp -= (width_mmx*8 - 2); // sign fixed
1925
for (i = width; i; i--)
1930
png_memcpy(v, sptr, 2);
1931
for (j = 0; j < png_pass_inc[pass]; j++)
1934
png_memcpy(dp, v, 2);
1938
else if (width) // pass == 4 or 5
1940
int width_mmx = ((width >> 1) << 1) ;
1941
width -= width_mmx; // 0,1 pixels => 0,2 bytes
1944
int dummy_value_c; // fix 'forbidden register spilled'
1947
__asm__ __volatile__ (
1948
"subl $2, %%esi \n\t"
1949
"subl $6, %%edi \n\t"
1951
".loop2_pass4: \n\t"
1952
"movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1953
"punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1954
"subl $4, %%esi \n\t"
1955
"movq %%mm0, (%%edi) \n\t"
1956
"subl $8, %%edi \n\t"
1957
"subl $2, %%ecx \n\t"
1958
"jnz .loop2_pass4 \n\t"
1961
: "=c" (dummy_value_c), // output regs (dummy)
1962
"=S" (dummy_value_S),
1963
"=D" (dummy_value_D)
1965
: "1" (sptr), // esi // input regs
1967
"0" (width_mmx) // ecx
1969
// : // clobber list
1970
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
1976
sptr -= (width_mmx*2 - 2); // sign fixed
1977
dp -= (width_mmx*4 - 2); // sign fixed
1978
for (i = width; i; i--)
1983
png_memcpy(v, sptr, 2);
1984
for (j = 0; j < png_pass_inc[pass]; j++)
1987
png_memcpy(dp, v, 2);
1991
} /* end of pixel_bytes == 2 */
1993
//--------------------------------------------------------------
1994
else if (pixel_bytes == 4)
1996
if (((pass == 0) || (pass == 1)) && width)
1998
int width_mmx = ((width >> 1) << 1);
1999
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2001
fprintf(stderr, "GRR DEBUG: png_do_read_interlace() pass = %d, width_mmx = %d, width = %d\n", pass, width_mmx, width);
2002
fprintf(stderr, " sptr = 0x%08lx, dp = 0x%08lx\n", (unsigned long)sptr, (unsigned long)dp);
2007
int dummy_value_c; // fix 'forbidden register spilled'
2011
FILE *junk = fopen("junk.4bytes", "wb");
2014
#endif /* GRR_DEBUG */
2015
__asm__ __volatile__ (
2016
"subl $4, %%esi \n\t"
2017
"subl $60, %%edi \n\t"
2019
".loop4_pass0: \n\t"
2020
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2021
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2022
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2023
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2024
"movq %%mm0, (%%edi) \n\t"
2025
"movq %%mm0, 8(%%edi) \n\t"
2026
"movq %%mm0, 16(%%edi) \n\t"
2027
"movq %%mm0, 24(%%edi) \n\t"
2028
"movq %%mm1, 32(%%edi) \n\t"
2029
"movq %%mm1, 40(%%edi) \n\t"
2030
"movq %%mm1, 48(%%edi) \n\t"
2031
"subl $8, %%esi \n\t"
2032
"movq %%mm1, 56(%%edi) \n\t"
2033
"subl $64, %%edi \n\t"
2034
"subl $2, %%ecx \n\t"
2035
"jnz .loop4_pass0 \n\t"
2038
: "=c" (dummy_value_c), // output regs (dummy)
2039
"=S" (dummy_value_S),
2040
"=D" (dummy_value_D)
2042
: "1" (sptr), // esi // input regs
2044
"0" (width_mmx) // ecx
2046
// : // clobber list
2047
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2053
sptr -= (width_mmx*4 - 4); // sign fixed
2054
dp -= (width_mmx*32 - 4); // sign fixed
2055
for (i = width; i; i--)
2060
png_memcpy(v, sptr, 4);
2061
for (j = 0; j < png_pass_inc[pass]; j++)
2064
png_memcpy(dp, v, 4);
2068
else if (((pass == 2) || (pass == 3)) && width)
2070
int width_mmx = ((width >> 1) << 1);
2071
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2074
int dummy_value_c; // fix 'forbidden register spilled'
2077
__asm__ __volatile__ (
2078
"subl $4, %%esi \n\t"
2079
"subl $28, %%edi \n\t"
2081
".loop4_pass2: \n\t"
2082
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2083
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2084
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2085
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2086
"movq %%mm0, (%%edi) \n\t"
2087
"movq %%mm0, 8(%%edi) \n\t"
2088
"movq %%mm1, 16(%%edi) \n\t"
2089
"movq %%mm1, 24(%%edi) \n\t"
2090
"subl $8, %%esi \n\t"
2091
"subl $32, %%edi \n\t"
2092
"subl $2, %%ecx \n\t"
2093
"jnz .loop4_pass2 \n\t"
2096
: "=c" (dummy_value_c), // output regs (dummy)
2097
"=S" (dummy_value_S),
2098
"=D" (dummy_value_D)
2100
: "1" (sptr), // esi // input regs
2102
"0" (width_mmx) // ecx
2104
// : // clobber list
2105
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2111
sptr -= (width_mmx*4 - 4); // sign fixed
2112
dp -= (width_mmx*16 - 4); // sign fixed
2113
for (i = width; i; i--)
2118
png_memcpy(v, sptr, 4);
2119
for (j = 0; j < png_pass_inc[pass]; j++)
2122
png_memcpy(dp, v, 4);
2126
else if (width) // pass == 4 or 5
2128
int width_mmx = ((width >> 1) << 1) ;
2129
width -= width_mmx; // 0,1 pixels => 0,4 bytes
2132
int dummy_value_c; // fix 'forbidden register spilled'
2135
__asm__ __volatile__ (
2136
"subl $4, %%esi \n\t"
2137
"subl $12, %%edi \n\t"
2139
".loop4_pass4: \n\t"
2140
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2141
"movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2142
"punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2143
"punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2144
"movq %%mm0, (%%edi) \n\t"
2145
"subl $8, %%esi \n\t"
2146
"movq %%mm1, 8(%%edi) \n\t"
2147
"subl $16, %%edi \n\t"
2148
"subl $2, %%ecx \n\t"
2149
"jnz .loop4_pass4 \n\t"
2152
: "=c" (dummy_value_c), // output regs (dummy)
2153
"=S" (dummy_value_S),
2154
"=D" (dummy_value_D)
2156
: "1" (sptr), // esi // input regs
2158
"0" (width_mmx) // ecx
2160
// : // clobber list
2161
#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2167
sptr -= (width_mmx*4 - 4); // sign fixed
2168
dp -= (width_mmx*8 - 4); // sign fixed
2169
for (i = width; i; i--)
2174
png_memcpy(v, sptr, 4);
2175
for (j = 0; j < png_pass_inc[pass]; j++)
2178
png_memcpy(dp, v, 4);
2182
} /* end of pixel_bytes == 4 */
2184
#define STILL_WORKING_ON_THIS
2185
#ifdef STILL_WORKING_ON_THIS // GRR: should work, but needs testing
2186
// (special 64-bit version of rpng2)
2188
//--------------------------------------------------------------
2189
else if (pixel_bytes == 8)
2191
// GRR NOTE: no need to combine passes here!
2192
if (((pass == 0) || (pass == 1)) && width)
2194
// source is 8-byte RRGGBBAA
2195
// dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2196
int dummy_value_c; // fix 'forbidden register spilled'
2200
FILE *junk = fopen("junk.8bytes", "wb");
2203
#endif /* GRR_DEBUG */
2204
__asm__ __volatile__ (
2205
"subl $56, %%edi \n\t" // start of last block
2207
".loop8_pass0: \n\t"
2208
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2209
"movq %%mm0, (%%edi) \n\t"
2210
"movq %%mm0, 8(%%edi) \n\t"
2211
"movq %%mm0, 16(%%edi) \n\t"
2212
"movq %%mm0, 24(%%edi) \n\t"
2213
"movq %%mm0, 32(%%edi) \n\t"
2214
"movq %%mm0, 40(%%edi) \n\t"
2215
"movq %%mm0, 48(%%edi) \n\t"
2216
"subl $8, %%esi \n\t"
2217
"movq %%mm0, 56(%%edi) \n\t"
2218
"subl $64, %%edi \n\t"
2220
"jnz .loop8_pass0 \n\t"
2223
: "=c" (dummy_value_c), // output regs (dummy)
2224
"=S" (dummy_value_S),
2225
"=D" (dummy_value_D)
2227
: "1" (sptr), // esi // input regs
2231
// : // clobber list
2232
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2237
else if (((pass == 2) || (pass == 3)) && width)
2239
// source is 8-byte RRGGBBAA
2240
// dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2241
int width_mmx = ((width >> 1) << 1) ;
2245
int dummy_value_c; // fix 'forbidden register spilled'
2248
__asm__ __volatile__ (
2249
"subl $24, %%edi \n\t" // start of last block
2251
".loop8_pass2: \n\t"
2252
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2253
"movq %%mm0, (%%edi) \n\t"
2254
"movq %%mm0, 8(%%edi) \n\t"
2255
"movq %%mm0, 16(%%edi) \n\t"
2256
"subl $8, %%esi \n\t"
2257
"movq %%mm0, 24(%%edi) \n\t"
2258
"subl $32, %%edi \n\t"
2260
"jnz .loop8_pass2 \n\t"
2263
: "=c" (dummy_value_c), // output regs (dummy)
2264
"=S" (dummy_value_S),
2265
"=D" (dummy_value_D)
2267
: "1" (sptr), // esi // input regs
2271
// : // clobber list
2272
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2278
else if (width) // pass == 4 or 5
2280
// source is 8-byte RRGGBBAA
2281
// dest is 16-byte RRGGBBAA RRGGBBAA
2282
int width_mmx = ((width >> 1) << 1) ;
2286
int dummy_value_c; // fix 'forbidden register spilled'
2289
__asm__ __volatile__ (
2290
"subl $8, %%edi \n\t" // start of last block
2292
".loop8_pass4: \n\t"
2293
"movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2294
"movq %%mm0, (%%edi) \n\t"
2295
"subl $8, %%esi \n\t"
2296
"movq %%mm0, 8(%%edi) \n\t"
2297
"subl $16, %%edi \n\t"
2299
"jnz .loop8_pass4 \n\t"
2302
: "=c" (dummy_value_c), // output regs (dummy)
2303
"=S" (dummy_value_S),
2304
"=D" (dummy_value_D)
2306
: "1" (sptr), // esi // input regs
2310
// : // clobber list
2311
#if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2318
} /* end of pixel_bytes == 8 */
2320
#endif /* STILL_WORKING_ON_THIS */
2322
//--------------------------------------------------------------
2323
else if (pixel_bytes == 6)
2325
for (i = width; i; i--)
2329
png_memcpy(v, sptr, 6);
2330
for (j = 0; j < png_pass_inc[pass]; j++)
2332
png_memcpy(dp, v, 6);
2337
} /* end of pixel_bytes == 6 */
2339
//--------------------------------------------------------------
2342
for (i = width; i; i--)
2346
png_memcpy(v, sptr, pixel_bytes);
2347
for (j = 0; j < png_pass_inc[pass]; j++)
2349
png_memcpy(dp, v, pixel_bytes);
2355
} // end of mmx_supported =========================================
2357
else /* MMX not supported: use modified C code - takes advantage
2358
* of inlining of memcpy for a constant */
2359
/* GRR 19991007: does it? or should pixel_bytes in each
2360
* block be replaced with immediate value (e.g., 1)? */
2361
/* GRR 19991017: replaced with constants in each case */
2363
if (pixel_bytes == 1)
2365
for (i = width; i; i--)
2368
for (j = 0; j < png_pass_inc[pass]; j++)
2373
else if (pixel_bytes == 3)
2375
for (i = width; i; i--)
2379
png_memcpy(v, sptr, 3);
2380
for (j = 0; j < png_pass_inc[pass]; j++)
2382
png_memcpy(dp, v, 3);
2388
else if (pixel_bytes == 2)
2390
for (i = width; i; i--)
2394
png_memcpy(v, sptr, 2);
2395
for (j = 0; j < png_pass_inc[pass]; j++)
2397
png_memcpy(dp, v, 2);
2403
else if (pixel_bytes == 4)
2405
for (i = width; i; i--)
2409
png_memcpy(v, sptr, 4);
2410
for (j = 0; j < png_pass_inc[pass]; j++)
2412
png_memcpy(dp, v, 4);
2418
else if (pixel_bytes == 6)
2420
for (i = width; i; i--)
2424
png_memcpy(v, sptr, 6);
2425
for (j = 0; j < png_pass_inc[pass]; j++)
2427
png_memcpy(dp, v, 6);
2433
else if (pixel_bytes == 8)
2435
for (i = width; i; i--)
2439
png_memcpy(v, sptr, 8);
2440
for (j = 0; j < png_pass_inc[pass]; j++)
2442
png_memcpy(dp, v, 8);
2448
else // GRR: should never be reached
2450
for (i = width; i; i--)
2454
png_memcpy(v, sptr, pixel_bytes);
2455
for (j = 0; j < png_pass_inc[pass]; j++)
2457
png_memcpy(dp, v, pixel_bytes);
2460
sptr -= pixel_bytes;
2464
} /* end if (MMX not supported) */
2467
} /* end switch (row_info->pixel_depth) */
2469
row_info->width = final_width;
2470
row_info->rowbytes = ((final_width *
2471
(png_uint_32)row_info->pixel_depth + 7) >> 3);
2474
} /* end png_do_read_interlace() */
2476
#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2477
#endif /* PNG_READ_INTERLACING_SUPPORTED */
2480
// These variables are utilized in the functions below. They are declared
2481
// globally here to ensure alignment on 8-byte boundaries.
2486
} LBCarryMask = {0x0101010101010101LL},
2487
HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2488
ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
2491
// Optimized code for PNG Average filter decoder
2493
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2497
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2500
// int diff; GRR: global now (shortened to dif/_dif)
2502
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2503
_FullLength = row_info->rowbytes; // # of bytes to filter
2504
__asm__ __volatile__ (
2505
// Init address pointers and offset
2506
//GRR "movl row, %%edi \n\t" // edi ==> Avg(x)
2507
"xorl %%ebx, %%ebx \n\t" // ebx ==> x
2508
"movl %%edi, %%edx \n\t"
2509
//GRR "movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2510
//GRR "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2511
"subl %%ecx, %%edx \n\t" // edx ==> Raw(x-bpp)
2513
"xorl %%eax,%%eax \n\t"
2515
// Compute the Raw value for the first bpp bytes
2516
// Raw(x) = Avg(x) + (Prior(x)/2)
2518
"movb (%%esi,%%ebx,),%%al \n\t" // Load al with Prior(x)
2520
"shrb %%al \n\t" // divide by 2
2521
"addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2522
//GRR "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2523
"cmpl %%ecx, %%ebx \n\t"
2524
"movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2525
"jb avg_rlp \n\t" // mov does not affect flags
2527
// get # of bytes to alignment
2528
"movl %%edi, _dif \n\t" // take start of row
2529
"addl %%ebx, _dif \n\t" // add bpp
2530
"addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2531
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2532
"subl %%edi, _dif \n\t" // subtract from start => value ebx at alignment
2536
// Compute the Raw value for the bytes up to the alignment boundary
2537
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2538
"xorl %%ecx, %%ecx \n\t"
2540
"xorl %%eax, %%eax \n\t"
2541
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2542
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2543
"addw %%cx, %%ax \n\t"
2545
"shrw %%ax \n\t" // divide by 2
2546
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2547
"cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2548
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2549
"jb avg_lp1 \n\t" // repeat until at alignment boundary
2552
"movl _FullLength, %%eax \n\t"
2553
"movl %%eax, %%ecx \n\t"
2554
"subl %%ebx, %%eax \n\t" // subtract alignment fix
2555
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2556
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
2557
"movl %%ecx, _MMXLength \n\t"
2559
: "=c" (dummy_value_c), // output regs/vars here, e.g., "=m" (_MMXLength) instead of final instr
2560
"=S" (dummy_value_S),
2561
"=D" (dummy_value_D)
2563
: "1" (prev_row), // esi // input regs
2567
: "%eax", "%ebx", // clobber list
2569
// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) PROBABLY
2572
#ifdef GRR_GCC_MMX_CONVERTED
2573
// Now do the math for the rest of the row
2578
ActiveMask.use = 0x0000000000ffffff;
2579
ShiftBpp.use = 24; // == 3 * 8
2580
ShiftRem.use = 40; // == 64 - 24
2582
// Re-init address pointers and offset
2583
"movq $ActiveMask, %%mm7 \n\t"
2584
"movl _dif, %%ebx \n\t" // ebx ==> x = offset to alignment boundary
2585
"movq $LBCarryMask, %%mm5 \n\t"
2586
"movl row, %%edi \n\t" // edi ==> Avg(x)
2587
"movq $HBClearMask, %%mm4 \n\t"
2588
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2589
// PRIME the pump (load the first Raw(x-bpp) data set)
2590
"movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
2591
// (we correct position in loop below)
2593
"movq (%%edi,%%ebx,), %%mm0 \n\t" // Load mm0 with Avg(x)
2594
// Add (Prev_row/2) to Average
2595
"movq %%mm5, %%mm3 \n\t"
2596
"psrlq $ShiftRem, %%mm2 \n\t" // Correct position Raw(x-bpp) data
2597
"movq (%%esi,%%ebx,), %%mm1 \n\t" // Load mm1 with Prior(x)
2598
"movq %%mm7, %%mm6 \n\t"
2599
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2600
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2601
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2602
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2603
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2604
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2605
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2606
// lsb's were == 1 (Only valid for active group)
2607
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2608
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2609
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2610
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 1 bytes to add to Avg
2611
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2613
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2614
"psllq $ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 3-5
2615
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2616
"psllq $ShiftBpp, %%mm2 \n\t" // shift data to position correctly
2617
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2618
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2619
// lsb's were == 1 (Only valid for active group)
2620
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2621
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2622
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2623
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 2 bytes to add to Avg
2624
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2627
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2628
"psllq $ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover the last two
2630
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2631
"psllq $ShiftBpp, %%mm2 \n\t" // shift data to position correctly
2632
// Data only needs to be shifted once here to
2633
// get the correct x-bpp offset.
2634
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2635
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2636
// lsb's were == 1 (Only valid for active group)
2637
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2638
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2639
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2640
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 2 bytes to add to Avg
2641
"addl $8, %%ebx \n\t"
2642
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2644
// Now ready to write back to memory
2645
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
2646
// Move updated Raw(x) to use as Raw(x-bpp) for next loop
2647
"cmpl _MMXLength, %%ebx \n\t"
2648
"movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2651
: // output regs/vars go here, e.g.: "=m" (memory_var)
2653
: "S" (prev_row), // esi // input regs
2656
: "%ebx", "%edi", "%esi" // clobber list
2657
// GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength) PROBABLY
2658
// , "%mm0", "%mm1", "%mm2", "%mm3",
2659
// "%mm4", "%mm5", "%mm6", "%mm7"
2666
//case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2669
ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2670
// appropriate inactive bytes
2671
ShiftBpp.use = bpp << 3;
2672
ShiftRem.use = 64 - ShiftBpp.use;
2674
"movq $HBClearMask, %%mm4 \n\t"
2676
// Re-init address pointers and offset
2677
"movl _dif, %%ebx \n\t" // ebx ==> x = offset to alignment boundary
2679
// Load ActiveMask and clear all bytes except for 1st active group
2680
"movq $ActiveMask, %%mm7 \n\t"
2681
"movl row, %%edi \n\t" // edi ==> Avg(x)
2682
"psrlq $ShiftRem, %%mm7 \n\t"
2683
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2684
"movq %%mm7, %%mm6 \n\t"
2685
"movq $LBCarryMask, %%mm5 \n\t"
2686
"psllq $ShiftBpp, %%mm6 \n\t" // Create mask for 2nd active group
2688
// PRIME the pump (load the first Raw(x-bpp) data set
2689
"movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
2690
// (we correct position in loop below)
2692
"movq (%%edi,%%ebx,), %%mm0 \n\t"
2693
"psrlq $ShiftRem, %%mm2 \n\t" // shift data to position correctly
2694
"movq (%%esi,%%ebx,), %%mm1 \n\t"
2695
// Add (Prev_row/2) to Average
2696
"movq %%mm5, %%mm3 \n\t"
2697
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2698
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2699
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2700
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2701
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2702
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2703
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2704
// lsb's were == 1 (Only valid for active group)
2705
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2706
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2707
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2708
"pand %%mm7, %%mm2 \n\t" // Leave only Active Group 1 bytes to add to Avg
2709
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2711
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2712
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2713
"psllq $ShiftBpp, %%mm2 \n\t" // shift data to position correctly
2714
"addl $8, %%ebx \n\t"
2715
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2716
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2717
// lsb's were == 1 (Only valid for active group)
2718
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2719
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2720
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2721
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 2 bytes to add to Avg
2722
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2724
"cmpl _MMXLength, %%ebx \n\t"
2725
// Now ready to write back to memory
2726
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
2727
// Prep Raw(x-bpp) for next loop
2728
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2731
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
2733
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
2735
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
2738
break; // end 4,6 bpp
2742
ActiveMask.use = 0x000000000000ffff;
2743
ShiftBpp.use = 24; // == 3 * 8
2744
ShiftRem.use = 40; // == 64 - 24
2747
"movq $ActiveMask, %%mm7 \n\t"
2748
// Re-init address pointers and offset
2749
"movl _dif, %%ebx \n\t" // ebx ==> x = offset to alignment boundary
2750
"movq $LBCarryMask, %%mm5 \n\t"
2751
"movl row, %%edi \n\t" // edi ==> Avg(x)
2752
"movq $HBClearMask, %%mm4 \n\t"
2753
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2754
// PRIME the pump (load the first Raw(x-bpp) data set
2755
"movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
2756
// (we correct position in loop below)
2758
"movq (%%edi,%%ebx,), %%mm0 \n\t"
2759
"psllq $ShiftRem, %%mm2 \n\t" // shift data to position correctly
2760
"movq (%%esi,%%ebx,), %%mm1 \n\t"
2761
// Add (Prev_row/2) to Average
2762
"movq %%mm5, %%mm3 \n\t"
2763
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2764
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2765
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2766
"movq %%mm7, %%mm6 \n\t"
2767
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2768
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2769
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2770
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2771
// lsb's were == 1 (Only valid for active group)
2772
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2773
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2774
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2775
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 1 bytes to add to Avg
2776
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2777
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2778
"psllq $ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 2 & 3
2779
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2780
"psllq $ShiftBpp, %%mm2 \n\t" // shift data to position correctly
2781
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2782
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2783
// lsb's were == 1 (Only valid for active group)
2784
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2785
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2786
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2787
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 2 bytes to add to Avg
2788
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2790
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2791
"psllq $ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 4 & 5
2792
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2793
"psllq $ShiftBpp, %%mm2 \n\t" // shift data to position correctly
2794
// Data only needs to be shifted once here to
2795
// get the correct x-bpp offset.
2796
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2797
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2798
// lsb's were == 1 (Only valid for active group)
2799
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2800
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2801
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2802
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 2 bytes to add to Avg
2803
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2805
// Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2806
"psllq $ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 6 & 7
2807
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2808
"psllq $ShiftBpp, %%mm2 \n\t" // shift data to position correctly
2809
// Data only needs to be shifted once here to
2810
// get the correct x-bpp offset.
2811
"addl $8, %%ebx \n\t"
2812
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2813
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2814
// lsb's were == 1 (Only valid for active group)
2815
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2816
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2817
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2818
"pand %%mm6, %%mm2 \n\t" // Leave only Active Group 2 bytes to add to Avg
2819
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2821
"cmpl _MMXLength, %%ebx \n\t"
2822
// Now ready to write back to memory
2823
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
2824
// Prep Raw(x-bpp) for next loop
2825
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2828
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
2830
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
2832
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
2840
// Re-init address pointers and offset
2841
"movl _dif, %%ebx \n\t" // ebx ==> x = offset to alignment boundary
2842
"movl row, %%edi \n\t" // edi ==> Avg(x)
2843
"cmpl _FullLength, %%ebx \n\t" // Test if offset at end of array
2845
// Do Paeth decode for remaining bytes
2846
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2847
"movl %%edi, %%edx \n\t"
2848
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx in loop below
2849
"subl bpp, %%edx \n\t" // edx ==> Raw(x-bpp)
2851
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2852
"xorl %%eax, %%eax \n\t"
2853
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2854
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2855
"addw %%cx, %%ax \n\t"
2857
"shrw %%ax \n\t" // divide by 2
2858
"addb -1(%%edi,%%ebx,), %%al \n\t" // Add Avg(x); -1 to offset inc ebx
2859
"cmpl _FullLength, %%ebx \n\t" // Check if at end of array
2860
"movb %%al, -1(%%edi,%%ebx,) \n\t" // Write back Raw(x);
2861
// mov does not affect flags; -1 to offset inc ebx
2865
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
2867
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
2869
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
2872
return; // end 1 bpp
2877
// Re-init address pointers and offset
2878
"movl _dif, %%ebx \n\t" // ebx ==> x = offset to alignment boundary
2879
"movq $LBCarryMask, %%mm5 \n\t"
2880
"movl row, %%edi \n\t" // edi ==> Avg(x)
2881
"movq $HBClearMask, %%mm4 \n\t"
2882
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2883
// PRIME the pump (load the first Raw(x-bpp) data set
2884
"movq -8(%%edi,%%ebx,), %%mm2 \n\t" // Load previous aligned 8 bytes
2885
// (NO NEED to correct position in loop below)
2887
"movq (%%edi,%%ebx,), %%mm0 \n\t"
2888
"movq %%mm5, %%mm3 \n\t"
2889
"movq (%%esi,%%ebx,), %%mm1 \n\t"
2890
"addl $8, %%ebx \n\t"
2891
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2892
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2893
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte where both
2895
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2896
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2897
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each byte
2898
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2899
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2900
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
2901
"cmpl _MMXLength, %%ebx \n\t"
2902
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
2903
"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
2906
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
2908
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
2910
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5" // CHECKASM: clobber list
2915
default: // bpp greater than 8 (!= 1,2,3,4,6,8)
2918
GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED (unless smaller than 1?)
2921
"movq $LBCarryMask, %%mm5 \n\t"
2922
// Re-init address pointers and offset
2923
"movl _dif, %%ebx \n\t" // ebx ==> x = offset to alignment boundary
2924
"movl row, %%edi \n\t" // edi ==> Avg(x)
2925
"movq $HBClearMask, %%mm4 \n\t"
2926
"movl %%edi, %%edx \n\t"
2927
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2928
"subl bpp, %%edx \n\t" // edx ==> Raw(x-bpp)
2930
"movq (%%edi,%%ebx,), %%mm0 \n\t"
2931
"movq %%mm5, %%mm3 \n\t"
2932
"movq (%%esi,%%ebx,), %%mm1 \n\t"
2933
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2934
"movq (%%edx,%%ebx,), %%mm2 \n\t"
2935
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2936
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte where both
2938
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2939
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2940
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each byte
2941
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2942
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2943
"addl $8, %%ebx \n\t"
2944
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
2945
"cmpl _MMXLength, %%ebx \n\t"
2946
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
2949
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
2951
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
2953
: "%ebx", "%edx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5" // CHECKASM: clobber list
2957
} // end switch ( bpp )
2960
// MMX acceleration complete now do clean-up
2961
// Check if any remaining bytes left to decode
2962
"movl _MMXLength, %%ebx \n\t" // ebx ==> x = offset bytes remaining after MMX
2963
"movl row, %%edi \n\t" // edi ==> Avg(x)
2964
"cmpl _FullLength, %%ebx \n\t" // Test if offset at end of array
2966
// Do Paeth decode for remaining bytes
2967
"movl prev_row, %%esi \n\t" // esi ==> Prior(x)
2968
"movl %%edi, %%edx \n\t"
2969
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx in loop below
2970
"subl bpp, %%edx \n\t" // edx ==> Raw(x-bpp)
2972
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2973
"xorl %%eax, %%eax \n\t"
2974
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2975
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2976
"addw %%cx, %%ax \n\t"
2978
"shrw %%ax \n\t" // divide by 2
2979
"addb -1(%%edi,%%ebx,), %%al \n\t" // Add Avg(x); -1 to offset inc ebx
2980
"cmpl _FullLength, %%ebx \n\t" // Check if at end of array
2981
"movb %%al, -1(%%edi,%%ebx,) \n\t" // Write back Raw(x);
2982
// mov does not affect flags; -1 to offset inc ebx
2985
"emms \n\t" // End MMX instructions; prep for possible FP instrs.
2987
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
2989
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
2991
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
2993
#endif /* GRR_GCC_MMX_CONVERTED */
2996
// Optimized code for PNG Paeth filter decoder
2998
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3001
#ifdef GRR_GCC_MMX_CONVERTED
3003
int patemp, pbtemp, pctemp;
3005
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3006
_FullLength = row_info->rowbytes; // # of bytes to filter
3008
"xorl %%ebx, %%ebx \n\t" // ebx ==> x offset
3009
"movl row, %%edi \n\t"
3010
"xorl %%edx, %%edx \n\t" // edx ==> x-bpp offset
3011
"movl prev_row, %%esi \n\t"
3012
"xorl %%eax, %%eax \n\t"
3014
// Compute the Raw value for the first bpp bytes
3015
// Note: the formula works out to be always
3016
// Paeth(x) = Raw(x) + Prior(x) where x < bpp
3018
"movb (%%edi,%%ebx,), %%al \n\t"
3019
"addb (%%esi,%%ebx,), %%al \n\t"
3021
"cmpl bpp, %%ebx \n\t"
3022
"movb %%al, -1(%%edi,%%ebx,) \n\t"
3024
// get # of bytes to alignment
3025
"movl %%edi, _dif \n\t" // take start of row
3026
"addl %%ebx, _dif \n\t" // add bpp
3027
"xorl %%ecx, %%ecx \n\t"
3028
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment boundary
3029
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3030
"subl %%edi, _dif \n\t" // subtract from start ==> value ebx at alignment
3034
"xorl %%eax, %%eax \n\t"
3035
// pav = p - a = (a + b - c) - a = b - c
3036
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3037
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3038
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3039
"movl %%eax, patemp \n\t" // Save pav for later use
3040
"xorl %%eax, %%eax \n\t"
3041
// pbv = p - b = (a + b - c) - b = a - c
3042
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3043
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3044
"movl %%eax, %%ecx \n\t"
3045
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3046
"addl patemp, %%eax \n\t" // pcv = pav + pbv
3048
"testl $0x80000000, %%eax \n\t"
3050
"negl %%eax \n\t" // reverse sign of neg values
3052
"movl %%eax, pctemp \n\t" // save pc for later use
3054
"testl $0x80000000, %%ecx \n\t"
3056
"negl %%ecx \n\t" // reverse sign of neg values
3058
"movl %%ecx, pbtemp \n\t" // save pb for later use
3060
"movl patemp, %%eax \n\t"
3061
"testl $0x80000000, %%eax \n\t"
3063
"negl %%eax \n\t" // reverse sign of neg values
3065
"movl %%eax, patemp \n\t" // save pa for later use
3067
"cmpl %%ecx, %%eax \n\t"
3068
"jna paeth_abb \n\t"
3069
// pa > pb; now test if pb <= pc
3070
"cmpl pctemp, %%ecx \n\t"
3071
"jna paeth_bbc \n\t"
3072
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3073
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3074
"jmp paeth_paeth \n\t"
3076
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3077
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3078
"jmp paeth_paeth \n\t"
3080
// pa <= pb; now test if pa <= pc
3081
"cmpl pctemp, %%eax \n\t"
3082
"jna paeth_abc \n\t"
3083
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3084
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3085
"jmp paeth_paeth \n\t"
3087
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3088
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3092
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3093
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
3094
"cmpl _dif, %%ebx \n\t"
3097
"movl _FullLength, %%ecx \n\t"
3098
"movl %%ecx, %%eax \n\t"
3099
"subl %%ebx, %%eax \n\t" // subtract alignment fix
3100
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3101
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
3102
"movl %%ecx, _MMXLength \n\t"
3104
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3106
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3108
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3111
// Now do the math for the rest of the row
3116
ActiveMask.use = 0x0000000000ffffff;
3117
ActiveMaskEnd.use = 0xffff000000000000;
3118
ShiftBpp.use = 24; // == bpp(3) * 8
3119
ShiftRem.use = 40; // == 64 - 24
3121
"movl _dif, %%ebx \n\t"
3122
"movl row, %%edi \n\t"
3123
"movl prev_row, %%esi \n\t"
3124
"pxor %%mm0, %%mm0 \n\t"
3125
// PRIME the pump (load the first Raw(x-bpp) data set
3126
"movq -8(%%edi,%%ebx,), %%mm1 \n\t"
3128
"psrlq $ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st 3 bytes
3129
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x)
3130
"punpcklbw %%mm0, %%mm1 \n\t" // Unpack High bytes of a
3131
"movq -8(%%esi,%%ebx,), %%mm3 \n\t" // Prep c=Prior(x-bpp) bytes
3132
"punpcklbw %%mm0, %%mm2 \n\t" // Unpack High bytes of b
3133
"psrlq $ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st 3 bytes
3134
// pav = p - a = (a + b - c) - a = b - c
3135
"movq %%mm2, %%mm4 \n\t"
3136
"punpcklbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3137
// pbv = p - b = (a + b - c) - b = a - c
3138
"movq %%mm1, %%mm5 \n\t"
3139
"psubw %%mm3, %%mm4 \n\t"
3140
"pxor %%mm7, %%mm7 \n\t"
3141
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3142
"movq %%mm4, %%mm6 \n\t"
3143
"psubw %%mm3, %%mm5 \n\t"
3145
// pa = abs(p-a) = abs(pav)
3146
// pb = abs(p-b) = abs(pbv)
3147
// pc = abs(p-c) = abs(pcv)
3148
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3149
"paddw %%mm5, %%mm6 \n\t"
3150
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3151
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3152
"psubw %%mm0, %%mm4 \n\t"
3153
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3154
"psubw %%mm0, %%mm4 \n\t"
3155
"psubw %%mm7, %%mm5 \n\t"
3156
"pxor %%mm0, %%mm0 \n\t"
3157
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3158
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3159
"psubw %%mm7, %%mm5 \n\t"
3160
"psubw %%mm0, %%mm6 \n\t"
3162
"movq %%mm4, %%mm7 \n\t"
3163
"psubw %%mm0, %%mm6 \n\t"
3164
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3165
"movq %%mm7, %%mm0 \n\t"
3166
// use mm7 mask to merge pa & pb
3167
"pand %%mm7, %%mm5 \n\t"
3168
// use mm0 mask copy to merge a & b
3169
"pand %%mm0, %%mm2 \n\t"
3170
"pandn %%mm4, %%mm7 \n\t"
3171
"pandn %%mm1, %%mm0 \n\t"
3172
"paddw %%mm5, %%mm7 \n\t"
3173
"paddw %%mm2, %%mm0 \n\t"
3174
// test ((pa <= pb)? pa:pb) <= pc
3175
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3176
"pxor %%mm1, %%mm1 \n\t"
3177
"pand %%mm7, %%mm3 \n\t"
3178
"pandn %%mm0, %%mm7 \n\t"
3179
"paddw %%mm3, %%mm7 \n\t"
3180
"pxor %%mm0, %%mm0 \n\t"
3181
"packuswb %%mm1, %%mm7 \n\t"
3182
"movq (%%esi,%%ebx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3183
"pand $ActiveMask, %%mm7 \n\t"
3184
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3185
"paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3186
"punpcklbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3187
"movq %%mm7, (%%edi,%%ebx,) \n\t" // write back updated value
3188
"movq %%mm7, %%mm1 \n\t" // Now mm1 will be used as Raw(x-bpp)
3189
// Now do Paeth for 2nd set of bytes (3-5)
3190
"psrlq $ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3191
"punpcklbw %%mm0, %%mm1 \n\t" // Unpack High bytes of a
3192
"pxor %%mm7, %%mm7 \n\t"
3193
"punpcklbw %%mm0, %%mm2 \n\t" // Unpack High bytes of b
3194
// pbv = p - b = (a + b - c) - b = a - c
3195
"movq %%mm1, %%mm5 \n\t"
3196
// pav = p - a = (a + b - c) - a = b - c
3197
"movq %%mm2, %%mm4 \n\t"
3198
"psubw %%mm3, %%mm5 \n\t"
3199
"psubw %%mm3, %%mm4 \n\t"
3200
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3201
// pav + pbv = pbv + pav
3202
"movq %%mm5, %%mm6 \n\t"
3203
"paddw %%mm4, %%mm6 \n\t"
3205
// pa = abs(p-a) = abs(pav)
3206
// pb = abs(p-b) = abs(pbv)
3207
// pc = abs(p-c) = abs(pcv)
3208
"pcmpgtw %%mm5, %%mm0 \n\t" // Create mask pbv bytes < 0
3209
"pcmpgtw %%mm4, %%mm7 \n\t" // Create mask pav bytes < 0
3210
"pand %%mm5, %%mm0 \n\t" // Only pbv bytes < 0 in mm0
3211
"pand %%mm4, %%mm7 \n\t" // Only pav bytes < 0 in mm7
3212
"psubw %%mm0, %%mm5 \n\t"
3213
"psubw %%mm7, %%mm4 \n\t"
3214
"psubw %%mm0, %%mm5 \n\t"
3215
"psubw %%mm7, %%mm4 \n\t"
3216
"pxor %%mm0, %%mm0 \n\t"
3217
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3218
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3219
"psubw %%mm0, %%mm6 \n\t"
3221
"movq %%mm4, %%mm7 \n\t"
3222
"psubw %%mm0, %%mm6 \n\t"
3223
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3224
"movq %%mm7, %%mm0 \n\t"
3225
// use mm7 mask to merge pa & pb
3226
"pand %%mm7, %%mm5 \n\t"
3227
// use mm0 mask copy to merge a & b
3228
"pand %%mm0, %%mm2 \n\t"
3229
"pandn %%mm4, %%mm7 \n\t"
3230
"pandn %%mm1, %%mm0 \n\t"
3231
"paddw %%mm5, %%mm7 \n\t"
3232
"paddw %%mm2, %%mm0 \n\t"
3233
// test ((pa <= pb)? pa:pb) <= pc
3234
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3235
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x)
3236
"pand %%mm7, %%mm3 \n\t"
3237
"pandn %%mm0, %%mm7 \n\t"
3238
"pxor %%mm1, %%mm1 \n\t"
3239
"paddw %%mm3, %%mm7 \n\t"
3240
"pxor %%mm0, %%mm0 \n\t"
3241
"packuswb %%mm1, %%mm7 \n\t"
3242
"movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3243
"pand $ActiveMask, %%mm7 \n\t"
3244
"punpckhbw %%mm0, %%mm2 \n\t" // Unpack High bytes of b
3245
"psllq $ShiftBpp, %%mm7 \n\t" // Shift bytes to 2nd group of 3 bytes
3246
// pav = p - a = (a + b - c) - a = b - c
3247
"movq %%mm2, %%mm4 \n\t"
3248
"paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3249
"psllq $ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3250
"movq %%mm7, (%%edi,%%ebx,) \n\t" // write back updated value
3251
"movq %%mm7, %%mm1 \n\t"
3252
"punpckhbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3253
"psllq $ShiftBpp, %%mm1 \n\t" // Shift bytes
3254
// Now mm1 will be used as Raw(x-bpp)
3255
// Now do Paeth for 3rd, and final, set of bytes (6-7)
3256
"pxor %%mm7, %%mm7 \n\t"
3257
"punpckhbw %%mm0, %%mm1 \n\t" // Unpack High bytes of a
3258
"psubw %%mm3, %%mm4 \n\t"
3259
// pbv = p - b = (a + b - c) - b = a - c
3260
"movq %%mm1, %%mm5 \n\t"
3261
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3262
"movq %%mm4, %%mm6 \n\t"
3263
"psubw %%mm3, %%mm5 \n\t"
3264
"pxor %%mm0, %%mm0 \n\t"
3265
"paddw %%mm5, %%mm6 \n\t"
3267
// pa = abs(p-a) = abs(pav)
3268
// pb = abs(p-b) = abs(pbv)
3269
// pc = abs(p-c) = abs(pcv)
3270
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3271
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3272
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3273
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3274
"psubw %%mm0, %%mm4 \n\t"
3275
"psubw %%mm7, %%mm5 \n\t"
3276
"psubw %%mm0, %%mm4 \n\t"
3277
"psubw %%mm7, %%mm5 \n\t"
3278
"pxor %%mm0, %%mm0 \n\t"
3279
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3280
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3281
"psubw %%mm0, %%mm6 \n\t"
3283
"movq %%mm4, %%mm7 \n\t"
3284
"psubw %%mm0, %%mm6 \n\t"
3285
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3286
"movq %%mm7, %%mm0 \n\t"
3287
// use mm0 mask copy to merge a & b
3288
"pand %%mm0, %%mm2 \n\t"
3289
// use mm7 mask to merge pa & pb
3290
"pand %%mm7, %%mm5 \n\t"
3291
"pandn %%mm1, %%mm0 \n\t"
3292
"pandn %%mm4, %%mm7 \n\t"
3293
"paddw %%mm2, %%mm0 \n\t"
3294
"paddw %%mm5, %%mm7 \n\t"
3295
// test ((pa <= pb)? pa:pb) <= pc
3296
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3297
"pand %%mm7, %%mm3 \n\t"
3298
"pandn %%mm0, %%mm7 \n\t"
3299
"paddw %%mm3, %%mm7 \n\t"
3300
"pxor %%mm1, %%mm1 \n\t"
3301
"packuswb %%mm7, %%mm1 \n\t"
3302
// Step ebx to next set of 8 bytes and repeat loop til done
3303
"addl $8, %%ebx \n\t"
3304
"pand $ActiveMaskEnd, %%mm1 \n\t"
3305
"paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3307
"cmpl _MMXLength, %%ebx \n\t"
3308
"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3309
"movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
3310
// mm1 will be used as Raw(x-bpp) next loop
3311
// mm3 ready to be used as Prior(x-bpp) next loop
3314
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3316
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3318
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
3324
//case 7: // GRR BOGUS
3325
//case 5: // GRR BOGUS
3327
ActiveMask.use = 0x00000000ffffffff;
3328
ActiveMask2.use = 0xffffffff00000000;
3329
ShiftBpp.use = bpp << 3; // == bpp * 8
3330
ShiftRem.use = 64 - ShiftBpp.use;
3332
"movl _dif, %%ebx \n\t"
3333
"movl row, %%edi \n\t"
3334
"movl prev_row, %%esi \n\t"
3335
// PRIME the pump (load the first Raw(x-bpp) data set
3336
"movq -8(%%edi,%%ebx,), %%mm1 \n\t"
3337
"pxor %%mm0, %%mm0 \n\t"
3339
// Must shift to position Raw(x-bpp) data
3340
"psrlq $ShiftRem, %%mm1 \n\t"
3341
// Do first set of 4 bytes
3342
"movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3343
"punpcklbw %%mm0, %%mm1 \n\t" // Unpack Low bytes of a
3344
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x)
3345
"punpcklbw %%mm0, %%mm2 \n\t" // Unpack Low bytes of b
3346
// Must shift to position Prior(x-bpp) data
3347
"psrlq $ShiftRem, %%mm3 \n\t"
3348
// pav = p - a = (a + b - c) - a = b - c
3349
"movq %%mm2, %%mm4 \n\t"
3350
"punpcklbw %%mm0, %%mm3 \n\t" // Unpack Low bytes of c
3351
// pbv = p - b = (a + b - c) - b = a - c
3352
"movq %%mm1, %%mm5 \n\t"
3353
"psubw %%mm3, %%mm4 \n\t"
3354
"pxor %%mm7, %%mm7 \n\t"
3355
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3356
"movq %%mm4, %%mm6 \n\t"
3357
"psubw %%mm3, %%mm5 \n\t"
3358
// pa = abs(p-a) = abs(pav)
3359
// pb = abs(p-b) = abs(pbv)
3360
// pc = abs(p-c) = abs(pcv)
3361
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3362
"paddw %%mm5, %%mm6 \n\t"
3363
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3364
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3365
"psubw %%mm0, %%mm4 \n\t"
3366
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3367
"psubw %%mm0, %%mm4 \n\t"
3368
"psubw %%mm7, %%mm5 \n\t"
3369
"pxor %%mm0, %%mm0 \n\t"
3370
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3371
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3372
"psubw %%mm7, %%mm5 \n\t"
3373
"psubw %%mm0, %%mm6 \n\t"
3375
"movq %%mm4, %%mm7 \n\t"
3376
"psubw %%mm0, %%mm6 \n\t"
3377
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3378
"movq %%mm7, %%mm0 \n\t"
3379
// use mm7 mask to merge pa & pb
3380
"pand %%mm7, %%mm5 \n\t"
3381
// use mm0 mask copy to merge a & b
3382
"pand %%mm0, %%mm2 \n\t"
3383
"pandn %%mm4, %%mm7 \n\t"
3384
"pandn %%mm1, %%mm0 \n\t"
3385
"paddw %%mm5, %%mm7 \n\t"
3386
"paddw %%mm2, %%mm0 \n\t"
3387
// test ((pa <= pb)? pa:pb) <= pc
3388
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3389
"pxor %%mm1, %%mm1 \n\t"
3390
"pand %%mm7, %%mm3 \n\t"
3391
"pandn %%mm0, %%mm7 \n\t"
3392
"paddw %%mm3, %%mm7 \n\t"
3393
"pxor %%mm0, %%mm0 \n\t"
3394
"packuswb %%mm1, %%mm7 \n\t"
3395
"movq -8(%%esi,%%ebx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3396
"pand $ActiveMask, %%mm7 \n\t"
3397
"psrlq $ShiftRem, %%mm3 \n\t"
3398
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x) step 1
3399
"paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3400
"movq %%mm2, %%mm6 \n\t"
3401
"movq %%mm7, (%%edi,%%ebx,) \n\t" // write back updated value
3402
"movq -8(%%edi,%%ebx,), %%mm1 \n\t"
3403
"psllq $ShiftBpp, %%mm6 \n\t"
3404
"movq %%mm7, %%mm5 \n\t"
3405
"psrlq $ShiftRem, %%mm1 \n\t"
3406
"por %%mm6, %%mm3 \n\t"
3407
"psllq $ShiftBpp, %%mm5 \n\t"
3408
"punpckhbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3409
"por %%mm5, %%mm1 \n\t"
3410
// Do second set of 4 bytes
3411
"punpckhbw %%mm0, %%mm2 \n\t" // Unpack High bytes of b
3412
"punpckhbw %%mm0, %%mm1 \n\t" // Unpack High bytes of a
3413
// pav = p - a = (a + b - c) - a = b - c
3414
"movq %%mm2, %%mm4 \n\t"
3415
// pbv = p - b = (a + b - c) - b = a - c
3416
"movq %%mm1, %%mm5 \n\t"
3417
"psubw %%mm3, %%mm4 \n\t"
3418
"pxor %%mm7, %%mm7 \n\t"
3419
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3420
"movq %%mm4, %%mm6 \n\t"
3421
"psubw %%mm3, %%mm5 \n\t"
3422
// pa = abs(p-a) = abs(pav)
3423
// pb = abs(p-b) = abs(pbv)
3424
// pc = abs(p-c) = abs(pcv)
3425
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3426
"paddw %%mm5, %%mm6 \n\t"
3427
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3428
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3429
"psubw %%mm0, %%mm4 \n\t"
3430
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3431
"psubw %%mm0, %%mm4 \n\t"
3432
"psubw %%mm7, %%mm5 \n\t"
3433
"pxor %%mm0, %%mm0 \n\t"
3434
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3435
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3436
"psubw %%mm7, %%mm5 \n\t"
3437
"psubw %%mm0, %%mm6 \n\t"
3439
"movq %%mm4, %%mm7 \n\t"
3440
"psubw %%mm0, %%mm6 \n\t"
3441
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3442
"movq %%mm7, %%mm0 \n\t"
3443
// use mm7 mask to merge pa & pb
3444
"pand %%mm7, %%mm5 \n\t"
3445
// use mm0 mask copy to merge a & b
3446
"pand %%mm0, %%mm2 \n\t"
3447
"pandn %%mm4, %%mm7 \n\t"
3448
"pandn %%mm1, %%mm0 \n\t"
3449
"paddw %%mm5, %%mm7 \n\t"
3450
"paddw %%mm2, %%mm0 \n\t"
3451
// test ((pa <= pb)? pa:pb) <= pc
3452
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3453
"pxor %%mm1, %%mm1 \n\t"
3454
"pand %%mm7, %%mm3 \n\t"
3455
"pandn %%mm0, %%mm7 \n\t"
3456
"pxor %%mm1, %%mm1 \n\t"
3457
"paddw %%mm3, %%mm7 \n\t"
3458
"pxor %%mm0, %%mm0 \n\t"
3459
// Step ex to next set of 8 bytes and repeat loop til done
3460
"addl $8, %%ebx \n\t"
3461
"packuswb %%mm7, %%mm1 \n\t"
3462
"paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3463
"cmpl _MMXLength, %%ebx \n\t"
3464
"movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
3465
// mm1 will be used as Raw(x-bpp) next loop
3468
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3470
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3472
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
3479
ActiveMask.use = 0x00000000ffffffff;
3481
"movl _dif, %%ebx \n\t"
3482
"movl row, %%edi \n\t"
3483
"movl prev_row, %%esi \n\t"
3484
"pxor %%mm0, %%mm0 \n\t"
3485
// PRIME the pump (load the first Raw(x-bpp) data set
3486
"movq -8(%%edi,%%ebx,), %%mm1 \n\t" // Only time should need to read
3487
// a=Raw(x-bpp) bytes
3489
// Do first set of 4 bytes
3490
"movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3491
"punpckhbw %%mm0, %%mm1 \n\t" // Unpack Low bytes of a
3492
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x)
3493
"punpcklbw %%mm0, %%mm2 \n\t" // Unpack High bytes of b
3494
// pav = p - a = (a + b - c) - a = b - c
3495
"movq %%mm2, %%mm4 \n\t"
3496
"punpckhbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3497
// pbv = p - b = (a + b - c) - b = a - c
3498
"movq %%mm1, %%mm5 \n\t"
3499
"psubw %%mm3, %%mm4 \n\t"
3500
"pxor %%mm7, %%mm7 \n\t"
3501
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3502
"movq %%mm4, %%mm6 \n\t"
3503
"psubw %%mm3, %%mm5 \n\t"
3504
// pa = abs(p-a) = abs(pav)
3505
// pb = abs(p-b) = abs(pbv)
3506
// pc = abs(p-c) = abs(pcv)
3507
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3508
"paddw %%mm5, %%mm6 \n\t"
3509
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3510
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3511
"psubw %%mm0, %%mm4 \n\t"
3512
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3513
"psubw %%mm0, %%mm4 \n\t"
3514
"psubw %%mm7, %%mm5 \n\t"
3515
"pxor %%mm0, %%mm0 \n\t"
3516
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3517
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3518
"psubw %%mm7, %%mm5 \n\t"
3519
"psubw %%mm0, %%mm6 \n\t"
3521
"movq %%mm4, %%mm7 \n\t"
3522
"psubw %%mm0, %%mm6 \n\t"
3523
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3524
"movq %%mm7, %%mm0 \n\t"
3525
// use mm7 mask to merge pa & pb
3526
"pand %%mm7, %%mm5 \n\t"
3527
// use mm0 mask copy to merge a & b
3528
"pand %%mm0, %%mm2 \n\t"
3529
"pandn %%mm4, %%mm7 \n\t"
3530
"pandn %%mm1, %%mm0 \n\t"
3531
"paddw %%mm5, %%mm7 \n\t"
3532
"paddw %%mm2, %%mm0 \n\t"
3533
// test ((pa <= pb)? pa:pb) <= pc
3534
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3535
"pxor %%mm1, %%mm1 \n\t"
3536
"pand %%mm7, %%mm3 \n\t"
3537
"pandn %%mm0, %%mm7 \n\t"
3538
"paddw %%mm3, %%mm7 \n\t"
3539
"pxor %%mm0, %%mm0 \n\t"
3540
"packuswb %%mm1, %%mm7 \n\t"
3541
"movq (%%esi,%%ebx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3542
"pand $ActiveMask, %%mm7 \n\t"
3543
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3544
"paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3545
"punpcklbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3546
"movq %%mm7, (%%edi,%%ebx,) \n\t" // write back updated value
3547
"movq %%mm7, %%mm1 \n\t" // Now mm1 will be used as Raw(x-bpp)
3548
// Do second set of 4 bytes
3549
"punpckhbw %%mm0, %%mm2 \n\t" // Unpack Low bytes of b
3550
"punpcklbw %%mm0, %%mm1 \n\t" // Unpack Low bytes of a
3551
// pav = p - a = (a + b - c) - a = b - c
3552
"movq %%mm2, %%mm4 \n\t"
3553
// pbv = p - b = (a + b - c) - b = a - c
3554
"movq %%mm1, %%mm5 \n\t"
3555
"psubw %%mm3, %%mm4 \n\t"
3556
"pxor %%mm7, %%mm7 \n\t"
3557
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3558
"movq %%mm4, %%mm6 \n\t"
3559
"psubw %%mm3, %%mm5 \n\t"
3560
// pa = abs(p-a) = abs(pav)
3561
// pb = abs(p-b) = abs(pbv)
3562
// pc = abs(p-c) = abs(pcv)
3563
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3564
"paddw %%mm5, %%mm6 \n\t"
3565
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3566
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3567
"psubw %%mm0, %%mm4 \n\t"
3568
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3569
"psubw %%mm0, %%mm4 \n\t"
3570
"psubw %%mm7, %%mm5 \n\t"
3571
"pxor %%mm0, %%mm0 \n\t"
3572
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3573
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3574
"psubw %%mm7, %%mm5 \n\t"
3575
"psubw %%mm0, %%mm6 \n\t"
3577
"movq %%mm4, %%mm7 \n\t"
3578
"psubw %%mm0, %%mm6 \n\t"
3579
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3580
"movq %%mm7, %%mm0 \n\t"
3581
// use mm7 mask to merge pa & pb
3582
"pand %%mm7, %%mm5 \n\t"
3583
// use mm0 mask copy to merge a & b
3584
"pand %%mm0, %%mm2 \n\t"
3585
"pandn %%mm4, %%mm7 \n\t"
3586
"pandn %%mm1, %%mm0 \n\t"
3587
"paddw %%mm5, %%mm7 \n\t"
3588
"paddw %%mm2, %%mm0 \n\t"
3589
// test ((pa <= pb)? pa:pb) <= pc
3590
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3591
"pxor %%mm1, %%mm1 \n\t"
3592
"pand %%mm7, %%mm3 \n\t"
3593
"pandn %%mm0, %%mm7 \n\t"
3594
"pxor %%mm1, %%mm1 \n\t"
3595
"paddw %%mm3, %%mm7 \n\t"
3596
"pxor %%mm0, %%mm0 \n\t"
3597
// Step ex to next set of 8 bytes and repeat loop til done
3598
"addl $8, %%ebx \n\t"
3599
"packuswb %%mm7, %%mm1 \n\t"
3600
"paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3601
"cmpl _MMXLength, %%ebx \n\t"
3602
"movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
3603
// mm1 will be used as Raw(x-bpp) next loop
3606
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3608
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3610
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
3616
ActiveMask.use = 0x00000000ffffffff;
3618
"movl _dif, %%ebx \n\t"
3619
"movl row, %%edi \n\t"
3620
"movl prev_row, %%esi \n\t"
3621
"pxor %%mm0, %%mm0 \n\t"
3622
// PRIME the pump (load the first Raw(x-bpp) data set
3623
"movq -8(%%edi,%%ebx,), %%mm1 \n\t" // Only time should need to read
3624
// a=Raw(x-bpp) bytes
3626
// Do first set of 4 bytes
3627
"movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3628
"punpcklbw %%mm0, %%mm1 \n\t" // Unpack Low bytes of a
3629
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x)
3630
"punpcklbw %%mm0, %%mm2 \n\t" // Unpack Low bytes of b
3631
// pav = p - a = (a + b - c) - a = b - c
3632
"movq %%mm2, %%mm4 \n\t"
3633
"punpcklbw %%mm0, %%mm3 \n\t" // Unpack Low bytes of c
3634
// pbv = p - b = (a + b - c) - b = a - c
3635
"movq %%mm1, %%mm5 \n\t"
3636
"psubw %%mm3, %%mm4 \n\t"
3637
"pxor %%mm7, %%mm7 \n\t"
3638
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3639
"movq %%mm4, %%mm6 \n\t"
3640
"psubw %%mm3, %%mm5 \n\t"
3641
// pa = abs(p-a) = abs(pav)
3642
// pb = abs(p-b) = abs(pbv)
3643
// pc = abs(p-c) = abs(pcv)
3644
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3645
"paddw %%mm5, %%mm6 \n\t"
3646
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3647
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3648
"psubw %%mm0, %%mm4 \n\t"
3649
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3650
"psubw %%mm0, %%mm4 \n\t"
3651
"psubw %%mm7, %%mm5 \n\t"
3652
"pxor %%mm0, %%mm0 \n\t"
3653
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3654
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3655
"psubw %%mm7, %%mm5 \n\t"
3656
"psubw %%mm0, %%mm6 \n\t"
3658
"movq %%mm4, %%mm7 \n\t"
3659
"psubw %%mm0, %%mm6 \n\t"
3660
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3661
"movq %%mm7, %%mm0 \n\t"
3662
// use mm7 mask to merge pa & pb
3663
"pand %%mm7, %%mm5 \n\t"
3664
// use mm0 mask copy to merge a & b
3665
"pand %%mm0, %%mm2 \n\t"
3666
"pandn %%mm4, %%mm7 \n\t"
3667
"pandn %%mm1, %%mm0 \n\t"
3668
"paddw %%mm5, %%mm7 \n\t"
3669
"paddw %%mm2, %%mm0 \n\t"
3670
// test ((pa <= pb)? pa:pb) <= pc
3671
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3672
"pxor %%mm1, %%mm1 \n\t"
3673
"pand %%mm7, %%mm3 \n\t"
3674
"pandn %%mm0, %%mm7 \n\t"
3675
"paddw %%mm3, %%mm7 \n\t"
3676
"pxor %%mm0, %%mm0 \n\t"
3677
"packuswb %%mm1, %%mm7 \n\t"
3678
"movq -8(%%esi,%%ebx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3679
"pand $ActiveMask, %%mm7 \n\t"
3680
"movq (%%esi,%%ebx,), %%mm2 \n\t" // load b=Prior(x)
3681
"paddb (%%edi,%%ebx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3682
"punpckhbw %%mm0, %%mm3 \n\t" // Unpack High bytes of c
3683
"movq %%mm7, (%%edi,%%ebx,) \n\t" // write back updated value
3684
"movq -8(%%edi,%%ebx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
3686
// Do second set of 4 bytes
3687
"punpckhbw %%mm0, %%mm2 \n\t" // Unpack High bytes of b
3688
"punpckhbw %%mm0, %%mm1 \n\t" // Unpack High bytes of a
3689
// pav = p - a = (a + b - c) - a = b - c
3690
"movq %%mm2, %%mm4 \n\t"
3691
// pbv = p - b = (a + b - c) - b = a - c
3692
"movq %%mm1, %%mm5 \n\t"
3693
"psubw %%mm3, %%mm4 \n\t"
3694
"pxor %%mm7, %%mm7 \n\t"
3695
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3696
"movq %%mm4, %%mm6 \n\t"
3697
"psubw %%mm3, %%mm5 \n\t"
3698
// pa = abs(p-a) = abs(pav)
3699
// pb = abs(p-b) = abs(pbv)
3700
// pc = abs(p-c) = abs(pcv)
3701
"pcmpgtw %%mm4, %%mm0 \n\t" // Create mask pav bytes < 0
3702
"paddw %%mm5, %%mm6 \n\t"
3703
"pand %%mm4, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3704
"pcmpgtw %%mm5, %%mm7 \n\t" // Create mask pbv bytes < 0
3705
"psubw %%mm0, %%mm4 \n\t"
3706
"pand %%mm5, %%mm7 \n\t" // Only pbv bytes < 0 in mm0
3707
"psubw %%mm0, %%mm4 \n\t"
3708
"psubw %%mm7, %%mm5 \n\t"
3709
"pxor %%mm0, %%mm0 \n\t"
3710
"pcmpgtw %%mm6, %%mm0 \n\t" // Create mask pcv bytes < 0
3711
"pand %%mm6, %%mm0 \n\t" // Only pav bytes < 0 in mm7
3712
"psubw %%mm7, %%mm5 \n\t"
3713
"psubw %%mm0, %%mm6 \n\t"
3715
"movq %%mm4, %%mm7 \n\t"
3716
"psubw %%mm0, %%mm6 \n\t"
3717
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3718
"movq %%mm7, %%mm0 \n\t"
3719
// use mm7 mask to merge pa & pb
3720
"pand %%mm7, %%mm5 \n\t"
3721
// use mm0 mask copy to merge a & b
3722
"pand %%mm0, %%mm2 \n\t"
3723
"pandn %%mm4, %%mm7 \n\t"
3724
"pandn %%mm1, %%mm0 \n\t"
3725
"paddw %%mm5, %%mm7 \n\t"
3726
"paddw %%mm2, %%mm0 \n\t"
3727
// test ((pa <= pb)? pa:pb) <= pc
3728
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3729
"pxor %%mm1, %%mm1 \n\t"
3730
"pand %%mm7, %%mm3 \n\t"
3731
"pandn %%mm0, %%mm7 \n\t"
3732
"pxor %%mm1, %%mm1 \n\t"
3733
"paddw %%mm3, %%mm7 \n\t"
3734
"pxor %%mm0, %%mm0 \n\t"
3735
// Step ex to next set of 8 bytes and repeat loop til done
3736
"addl $8, %%ebx \n\t"
3737
"packuswb %%mm7, %%mm1 \n\t"
3738
"paddb -8(%%edi,%%ebx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3739
"cmpl _MMXLength, %%ebx \n\t"
3740
"movq %%mm1, -8(%%edi,%%ebx,) \n\t" // write back updated value
3741
// mm1 will be used as Raw(x-bpp) next loop
3744
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3746
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3748
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
3758
"movl _dif, %%ebx \n\t"
3759
"cmpl _FullLength, %%ebx \n\t"
3760
"jnb paeth_dend \n\t"
3761
"movl row, %%edi \n\t"
3762
"movl prev_row, %%esi \n\t"
3763
// Do Paeth decode for remaining bytes
3764
"movl %%ebx, %%edx \n\t"
3765
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx in loop below
3766
"subl bpp, %%edx \n\t" // Set edx = ebx - bpp
3768
"xorl %%eax, %%eax \n\t"
3769
// pav = p - a = (a + b - c) - a = b - c
3770
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3771
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3772
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3773
"movl %%eax, patemp \n\t" // Save pav for later use
3774
"xorl %%eax, %%eax \n\t"
3775
// pbv = p - b = (a + b - c) - b = a - c
3776
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3777
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3778
"movl %%eax, %%ecx \n\t"
3779
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3780
"addl patemp, %%eax \n\t" // pcv = pav + pbv
3782
"testl $0x80000000, %%eax \n\t"
3783
"jz paeth_dpca \n\t"
3784
"negl %%eax \n\t" // reverse sign of neg values
3786
"movl %%eax, pctemp \n\t" // save pc for later use
3788
"testl $0x80000000, %%ecx \n\t"
3789
"jz paeth_dpba \n\t"
3790
"negl %%ecx \n\t" // reverse sign of neg values
3792
"movl %%ecx, pbtemp \n\t" // save pb for later use
3794
"movl patemp, %%eax \n\t"
3795
"testl $0x80000000, %%eax \n\t"
3796
"jz paeth_dpaa \n\t"
3797
"negl %%eax \n\t" // reverse sign of neg values
3799
"movl %%eax, patemp \n\t" // save pa for later use
3801
"cmpl %%ecx, %%eax \n\t"
3802
"jna paeth_dabb \n\t"
3803
// pa > pb; now test if pb <= pc
3804
"cmpl pctemp, %%ecx \n\t"
3805
"jna paeth_dbbc \n\t"
3806
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3807
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3808
"jmp paeth_dpaeth \n\t"
3810
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3811
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3812
"jmp paeth_dpaeth \n\t"
3814
// pa <= pb; now test if pa <= pc
3815
"cmpl pctemp, %%eax \n\t"
3816
"jna paeth_dabc \n\t"
3817
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3818
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3819
"jmp paeth_dpaeth \n\t"
3821
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3822
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3823
"paeth_dpaeth: \n\t"
3826
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3827
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
3828
"cmpl _FullLength, %%ebx \n\t"
3832
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3834
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3836
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3839
return; // No need to go further with this one
3840
} // end switch ( bpp )
3842
// MMX acceleration complete now do clean-up
3843
// Check if any remaining bytes left to decode
3844
"movl _MMXLength, %%ebx \n\t"
3845
"cmpl _FullLength, %%ebx \n\t"
3846
"jnb paeth_end \n\t"
3847
"movl row, %%edi \n\t"
3848
"movl prev_row, %%esi \n\t"
3849
// Do Paeth decode for remaining bytes
3850
"movl %%ebx, %%edx \n\t"
3851
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx in loop below
3852
"subl bpp, %%edx \n\t" // Set edx = ebx - bpp
3854
"xorl %%eax, %%eax \n\t"
3855
// pav = p - a = (a + b - c) - a = b - c
3856
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3857
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3858
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3859
"movl %%eax, patemp \n\t" // Save pav for later use
3860
"xorl %%eax, %%eax \n\t"
3861
// pbv = p - b = (a + b - c) - b = a - c
3862
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3863
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3864
"movl %%eax, %%ecx \n\t"
3865
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3866
"addl patemp, %%eax \n\t" // pcv = pav + pbv
3868
"testl $0x80000000, %%eax \n\t"
3869
"jz paeth_pca2 \n\t"
3870
"negl %%eax \n\t" // reverse sign of neg values
3872
"movl %%eax, pctemp \n\t" // save pc for later use
3874
"testl $0x80000000, %%ecx \n\t"
3875
"jz paeth_pba2 \n\t"
3876
"negl %%ecx \n\t" // reverse sign of neg values
3878
"movl %%ecx, pbtemp \n\t" // save pb for later use
3880
"movl patemp, %%eax \n\t"
3881
"testl $0x80000000, %%eax \n\t"
3882
"jz paeth_paa2 \n\t"
3883
"negl %%eax \n\t" // reverse sign of neg values
3885
"movl %%eax, patemp \n\t" // save pa for later use
3887
"cmpl %%ecx, %%eax \n\t"
3888
"jna paeth_abb2 \n\t"
3889
// pa > pb; now test if pb <= pc
3890
"cmpl pctemp, %%ecx \n\t"
3891
"jna paeth_bbc2 \n\t"
3892
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3893
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3894
"jmp paeth_paeth2 \n\t"
3896
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3897
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3898
"jmp paeth_paeth2 \n\t"
3900
// pa <= pb; now test if pa <= pc
3901
"cmpl pctemp, %%eax \n\t"
3902
"jna paeth_abc2 \n\t"
3903
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3904
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3905
"jmp paeth_paeth2 \n\t"
3907
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3908
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3909
"paeth_paeth2: \n\t"
3912
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3913
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
3914
"cmpl _FullLength, %%ebx \n\t"
3917
"emms \n\t" // End MMX instructions; prep for possible FP instrs.
3919
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3921
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3923
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3925
#endif /* GRR_GCC_MMX_CONVERTED */
3928
// Optimized code for PNG Sub filter decoder
3930
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3932
#ifdef GRR_GCC_MMX_CONVERTED
3935
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3936
_FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3938
"movl row, %%edi \n\t"
3939
"movl %%edi, %%esi \n\t" // lp = row
3940
"addl bpp, %%edi \n\t" // rp = row + bpp
3941
"xorl %%eax, %%eax \n\t"
3942
// get # of bytes to alignment
3943
"movl %%edi, _dif \n\t" // take start of row
3944
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past
3945
// alignment boundary
3946
"xorl %%ebx, %%ebx \n\t"
3947
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3948
"subl %%edi, _dif \n\t" // subtract from start ==> value
3953
"movb (%%esi,%%ebx,), %%al \n\t"
3954
"addb %%al, (%%edi,%%ebx,) \n\t"
3956
"cmpl _dif, %%ebx \n\t"
3959
"movl _FullLength, %%ecx \n\t"
3960
"movl %%ecx, %%edx \n\t"
3961
"subl %%ebx, %%edx \n\t" // subtract alignment fix
3962
"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
3963
"subl %%edx, %%ecx \n\t" // drop over bytes from length
3964
"movl %%ecx, _MMXLength \n\t"
3966
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3968
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3970
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3973
// Now do the math for the rest of the row
3978
ActiveMask.use = 0x0000ffffff000000;
3979
ShiftBpp.use = 24; // == 3 * 8
3980
ShiftRem.use = 40; // == 64 - 24
3982
"movl row, %%edi \n\t"
3983
"movq $ActiveMask, %%mm7 \n\t" // Load ActiveMask for 2nd active byte group
3984
"movl %%edi, %%esi \n\t" // lp = row
3985
"addl bpp, %%edi \n\t" // rp = row + bpp
3986
"movq %%mm7, %%mm6 \n\t"
3987
"movl _dif, %%ebx \n\t"
3988
"psllq $ShiftBpp, %%mm6 \n\t" // Move mask in mm6 to cover 3rd active
3990
// PRIME the pump (load the first Raw(x-bpp) data set
3991
"movq -8(%%edi,%%ebx,), %%mm1 \n\t"
3993
"psrlq $ShiftRem, %%mm1 \n\t" // Shift data for adding 1st bpp bytes
3994
// no need for mask; shift clears inactive bytes
3995
// Add 1st active group
3996
"movq (%%edi,%%ebx,), %%mm0 \n\t"
3997
"paddb %%mm1, %%mm0 \n\t"
3998
// Add 2nd active group
3999
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4000
"psllq $ShiftBpp, %%mm1 \n\t" // shift data to position correctly
4001
"pand %%mm7, %%mm1 \n\t" // mask to use only 2nd active group
4002
"paddb %%mm1, %%mm0 \n\t"
4003
// Add 3rd active group
4004
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4005
"psllq $ShiftBpp, %%mm1 \n\t" // shift data to position correctly
4006
"pand %%mm6, %%mm1 \n\t" // mask to use only 3rd active group
4007
"addl $8, %%ebx \n\t"
4008
"paddb %%mm1, %%mm0 \n\t"
4009
"cmpl _MMXLength, %%ebx \n\t"
4010
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // Write updated Raws back to array
4011
// Prep for doing 1st add at top of loop
4012
"movq %%mm0, %%mm1 \n\t"
4015
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4017
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4019
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm6", "%mm7" // CHECKASM: clobber list
4026
// Placed here just in case this is a duplicate of the
4027
// non-MMX code for the SUB filter in png_read_filter_row above
4032
// bpp = (row_info->pixel_depth + 7) >> 3;
4033
// for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
4034
// i < row_info->rowbytes; i++, rp++, lp++)
4036
// *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
4039
"movl _dif, %%ebx \n\t"
4040
"movl row, %%edi \n\t"
4041
"cmpl _FullLength, %%ebx \n\t"
4043
"movl %%edi, %%esi \n\t" // lp = row
4044
"xorl %%eax, %%eax \n\t"
4045
"addl bpp, %%edi \n\t" // rp = row + bpp
4047
"movb (%%esi,%%ebx,), %%al \n\t"
4048
"addb %%al, (%%edi,%%ebx,) \n\t"
4050
"cmpl _FullLength, %%ebx \n\t"
4054
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4056
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4058
: "%eax", "%ebx", "%edi", "%esi" // CHECKASM: clobber list
4068
ShiftBpp.use = bpp << 3;
4069
ShiftRem.use = 64 - ShiftBpp.use;
4071
"movl row, %%edi \n\t"
4072
"movl _dif, %%ebx \n\t"
4073
"movl %%edi, %%esi \n\t" // lp = row
4074
"addl bpp, %%edi \n\t" // rp = row + bpp
4075
// PRIME the pump (load the first Raw(x-bpp) data set
4076
"movq -8(%%edi,%%ebx,), %%mm1 \n\t"
4078
"psrlq $ShiftRem, %%mm1 \n\t" // Shift data for adding 1st bpp bytes
4079
// no need for mask; shift clears inactive bytes
4080
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4081
"paddb %%mm1, %%mm0 \n\t"
4082
// Add 2nd active group
4083
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4084
"psllq $ShiftBpp, %%mm1 \n\t" // shift data to position correctly
4085
// there is no need for any mask
4086
// since shift clears inactive bits/bytes
4087
"addl $8, %%ebx \n\t"
4088
"paddb %%mm1, %%mm0 \n\t"
4089
"cmpl _MMXLength, %%ebx \n\t"
4090
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
4091
"movq %%mm0, %%mm1 \n\t" // Prep for doing 1st add at top of loop
4094
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4096
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4098
: "%ebx", "%edi", "%esi", "%mm0", "%mm1" // CHECKASM: clobber list
4105
ActiveMask.use = 0x00000000ffff0000;
4106
ShiftBpp.use = 16; // == 2 * 8
4107
ShiftRem.use = 48; // == 64 - 16
4109
"movq $ActiveMask, %%mm7 \n\t" // Load ActiveMask for 2nd active byte group
4110
"movl _dif, %%ebx \n\t"
4111
"movq %%mm7, %%mm6 \n\t"
4112
"movl row, %%edi \n\t"
4113
"psllq $ShiftBpp, %%mm6 \n\t" // Move mask in mm6 to cover 3rd active
4115
"movl %%edi, %%esi \n\t" // lp = row
4116
"movq %%mm6, %%mm5 \n\t"
4117
"addl bpp, %%edi \n\t" // rp = row + bpp
4118
"psllq $ShiftBpp, %%mm5 \n\t" // Move mask in mm5 to cover 4th active
4120
// PRIME the pump (load the first Raw(x-bpp) data set
4121
"movq -8(%%edi,%%ebx,), %%mm1 \n\t"
4123
// Add 1st active group
4124
"psrlq $ShiftRem, %%mm1 \n\t" // Shift data for adding 1st bpp bytes
4125
// no need for mask; shift clears inactive
4127
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4128
"paddb %%mm1, %%mm0 \n\t"
4129
// Add 2nd active group
4130
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4131
"psllq $ShiftBpp, %%mm1 \n\t" // shift data to position correctly
4132
"pand %%mm7, %%mm1 \n\t" // mask to use only 2nd active group
4133
"paddb %%mm1, %%mm0 \n\t"
4134
// Add 3rd active group
4135
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4136
"psllq $ShiftBpp, %%mm1 \n\t" // shift data to position correctly
4137
"pand %%mm6, %%mm1 \n\t" // mask to use only 3rd active group
4138
"paddb %%mm1, %%mm0 \n\t"
4139
// Add 4th active group
4140
"movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4141
"psllq $ShiftBpp, %%mm1 \n\t" // shift data to position correctly
4142
"pand %%mm5, %%mm1 \n\t" // mask to use only 4th active group
4143
"addl $8, %%ebx \n\t"
4144
"paddb %%mm1, %%mm0 \n\t"
4145
"cmpl _MMXLength, %%ebx \n\t"
4146
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // Write updated Raws back to array
4147
"movq %%mm0, %%mm1 \n\t" // Prep for doing 1st add at top of loop
4150
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4152
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4154
: "%ebx", "%edi", "%esi", "%mm0", "%mm1", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
4161
"movl row, %%edi \n\t"
4162
"movl _dif, %%ebx \n\t"
4163
"movl %%edi, %%esi \n\t" // lp = row
4164
"addl bpp, %%edi \n\t" // rp = row + bpp
4165
"movl _MMXLength, %%ecx \n\t"
4166
"movq -8(%%edi,%%ebx,), %%mm7 \n\t" // PRIME the pump (load the first
4167
// Raw(x-bpp) data set
4168
"andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4170
"movq (%%edi,%%ebx,), %%mm0 \n\t" // Load Sub(x) for 1st 8 bytes
4171
"paddb %%mm7, %%mm0 \n\t"
4172
"movq 8(%%edi,%%ebx,), %%mm1 \n\t" // Load Sub(x) for 2nd 8 bytes
4173
"movq %%mm0, (%%edi,%%ebx,) \n\t" // Write Raw(x) for 1st 8 bytes
4174
// Now mm0 will be used as Raw(x-bpp) for
4175
// the 2nd group of 8 bytes. This will be
4176
// repeated for each group of 8 bytes with
4177
// the 8th group being used as the Raw(x-bpp)
4178
// for the 1st group of the next loop.
4179
"paddb %%mm0, %%mm1 \n\t"
4180
"movq 16(%%edi,%%ebx,), %%mm2 \n\t" // Load Sub(x) for 3rd 8 bytes
4181
"movq %%mm1, 8(%%edi,%%ebx,) \n\t" // Write Raw(x) for 2nd 8 bytes
4182
"paddb %%mm1, %%mm2 \n\t"
4183
"movq 24(%%edi,%%ebx,), %%mm3 \n\t" // Load Sub(x) for 4th 8 bytes
4184
"movq %%mm2, 16(%%edi,%%ebx,) \n\t" // Write Raw(x) for 3rd 8 bytes
4185
"paddb %%mm2, %%mm3 \n\t"
4186
"movq 32(%%edi,%%ebx,), %%mm4 \n\t" // Load Sub(x) for 5th 8 bytes
4187
"movq %%mm3, 24(%%edi,%%ebx,) \n\t" // Write Raw(x) for 4th 8 bytes
4188
"paddb %%mm3, %%mm4 \n\t"
4189
"movq 40(%%edi,%%ebx,), %%mm5 \n\t" // Load Sub(x) for 6th 8 bytes
4190
"movq %%mm4, 32(%%edi,%%ebx,) \n\t" // Write Raw(x) for 5th 8 bytes
4191
"paddb %%mm4, %%mm5 \n\t"
4192
"movq 48(%%edi,%%ebx,), %%mm6 \n\t" // Load Sub(x) for 7th 8 bytes
4193
"movq %%mm5, 40(%%edi,%%ebx,) \n\t" // Write Raw(x) for 6th 8 bytes
4194
"paddb %%mm5, %%mm6 \n\t"
4195
"movq 56(%%edi,%%ebx,), %%mm7 \n\t" // Load Sub(x) for 8th 8 bytes
4196
"movq %%mm6, 48(%%edi,%%ebx,) \n\t" // Write Raw(x) for 7th 8 bytes
4197
"addl $64, %%ebx \n\t"
4198
"paddb %%mm6, %%mm7 \n\t"
4199
"cmpl %%ecx, %%ebx \n\t"
4200
"movq %%mm7, -8(%%edi,%%ebx,) \n\t" // Write Raw(x) for 8th 8 bytes
4202
"cmpl _MMXLength, %%ebx \n\t"
4205
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4206
"addl $8, %%ebx \n\t"
4207
"paddb %%mm7, %%mm0 \n\t"
4208
"cmpl _MMXLength, %%ebx \n\t"
4209
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // use -8 to offset early add to ebx
4210
"movq %%mm0, %%mm7 \n\t" // Move calculated Raw(x) data to mm1 to
4211
// be the new Raw(x-bpp) for the next loop
4215
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4217
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4219
: "%ebx", "%ecx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
4224
default: // bpp greater than 8 bytes
4227
"movl _dif, %%ebx \n\t"
4228
"movl row, %%edi \n\t"
4229
"movl %%edi, %%esi \n\t" // lp = row
4230
"addl bpp, %%edi \n\t" // rp = row + bpp
4232
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4233
"movq (%%esi,%%ebx,), %%mm1 \n\t"
4234
"addl $8, %%ebx \n\t"
4235
"paddb %%mm1, %%mm0 \n\t"
4236
"cmpl _MMXLength, %%ebx \n\t"
4237
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // mov does not affect flags; -8 to offset
4241
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4243
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4245
: "%ebx", "%edi", "%esi", "%mm0", "%mm1" // CHECKASM: clobber list
4250
} // end switch ( bpp )
4253
"movl _MMXLength, %%ebx \n\t"
4254
"movl row, %%edi \n\t"
4255
"cmpl _FullLength, %%ebx \n\t"
4257
"movl %%edi, %%esi \n\t" // lp = row
4258
"xorl %%eax, %%eax \n\t"
4259
"addl bpp, %%edi \n\t" // rp = row + bpp
4261
"movb (%%esi,%%ebx,), %%al \n\t"
4262
"addb %%al, (%%edi,%%ebx,) \n\t"
4264
"cmpl _FullLength, %%ebx \n\t"
4267
"emms \n\t" // end MMX instructions
4269
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4271
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4273
: "%eax", "%ebx", "%edi", "%esi" // CHECKASM: clobber list
4275
#endif /* GRR_GCC_MMX_CONVERTED */
4278
// Optimized code for PNG Up filter decoder
4280
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4283
#ifdef GRR_GCC_MMX_CONVERTED
4286
len = row_info->rowbytes; // # of bytes to filter
4288
"movl row, %%edi \n\t"
4289
// get # of bytes to alignment
4290
"movl %%edi, %%ecx \n\t"
4291
"xorl %%ebx, %%ebx \n\t"
4292
"addl $0x7, %%ecx \n\t"
4293
"xorl %%eax, %%eax \n\t"
4294
"andl $0xfffffff8, %%ecx \n\t"
4295
"movl prev_row, %%esi \n\t"
4296
"subl %%edi, %%ecx \n\t"
4300
"movb (%%edi,%%ebx,), %%al \n\t"
4301
"addb (%%esi,%%ebx,), %%al \n\t"
4303
"cmpl %%ecx, %%ebx \n\t"
4304
"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to offset inc ebx
4307
"movl len, %%ecx \n\t"
4308
"movl %%ecx, %%edx \n\t"
4309
"subl %%ebx, %%edx \n\t" // subtract alignment fix
4310
"andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4311
"subl %%edx, %%ecx \n\t" // drop over bytes from length
4312
// Unrolled loop - use all MMX registers and interleave to reduce
4313
// number of branch instructions (loops) and reduce partial stalls
4315
"movq (%%esi,%%ebx,), %%mm1 \n\t"
4316
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4317
"movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4318
"paddb %%mm1, %%mm0 \n\t"
4319
"movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4320
"movq %%mm0, (%%edi,%%ebx,) \n\t"
4321
"paddb %%mm3, %%mm2 \n\t"
4322
"movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4323
"movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4324
"movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4325
"movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4326
"paddb %%mm5, %%mm4 \n\t"
4327
"movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4328
"movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4329
"paddb %%mm7, %%mm6 \n\t"
4330
"movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4331
"movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4332
"movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4333
"movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4334
"paddb %%mm1, %%mm0 \n\t"
4335
"movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4336
"movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4337
"paddb %%mm3, %%mm2 \n\t"
4338
"movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4339
"movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4340
"movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4341
"movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4342
"paddb %%mm5, %%mm4 \n\t"
4343
"movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4344
"movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4345
"addl $64, %%ebx \n\t"
4346
"paddb %%mm7, %%mm6 \n\t"
4347
"cmpl %%ecx, %%ebx \n\t"
4348
"movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4349
// -8 to offset add ebx
4352
"cmpl $0, %%edx \n\t" // Test for bytes over mult of 64
4356
// 2 lines added by lcreeve@netins.net
4357
// (mail 11 Jul 98 in png-implement list)
4358
"cmpl $8, %%edx \n\t" //test for less than 8 bytes
4362
"addl %%edx, %%ecx \n\t"
4363
"andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4364
"subl %%edx, %%ecx \n\t" // drop over bytes from length
4366
// Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
4368
"movq (%%esi,%%ebx,), %%mm1 \n\t"
4369
"movq (%%edi,%%ebx,), %%mm0 \n\t"
4370
"addl $8, %%ebx \n\t"
4371
"paddb %%mm1, %%mm0 \n\t"
4372
"cmpl %%ecx, %%ebx \n\t"
4373
"movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to offset add ebx
4375
"cmpl $0, %%edx \n\t" // Test for bytes over mult of 8
4378
"xorl %%eax, %%eax \n\t"
4379
"addl %%edx, %%ecx \n\t" // move over byte count into counter
4380
// Loop using x86 registers to update remaining bytes
4382
"movb (%%edi,%%ebx,), %%al \n\t"
4383
"addb (%%esi,%%ebx,), %%al \n\t"
4385
"cmpl %%ecx, %%ebx \n\t"
4386
"movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to offset inc ebx
4389
// Conversion of filtered row completed
4390
"emms \n\t" // End MMX instructions; prep for possible FP instrs.
4392
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
4394
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
4396
: "%eax", "%ebx", "%ecx", "%edx", "%edi", "%esi", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" // CHECKASM: clobber list
4398
#endif /* GRR_GCC_MMX_CONVERTED */
4402
#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
4404
// Optimized png_read_filter_row routines
4407
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
4408
row, png_bytep prev_row, int filter)
4415
if (mmx_supported == 2)
4416
mmx_supported = mmxsupport();
4418
#ifdef GRR_GCC_MMX_CONVERTED
4422
png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
4427
png_debug(1, "in png_read_filter_row\n");
4429
png_debug1(0,"%s, ", "MMX");
4431
png_debug1(0,"%s, ", "x86");
4435
case 0: sprintf(filnm, "None ");
4437
case 1: sprintf(filnm, "Sub ");
4439
case 2: sprintf(filnm, "Up ");
4441
case 3: sprintf(filnm, "Avg ");
4443
case 4: sprintf(filnm, "Paeth");
4445
default: sprintf(filnm, "Unknw");
4448
png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
4449
png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
4450
(int)((row_info->pixel_depth + 7) >> 3));
4451
png_debug1(0,"len=%8d, ", row_info->rowbytes);
4456
case PNG_FILTER_VALUE_NONE:
4459
case PNG_FILTER_VALUE_SUB:
4461
if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
4463
png_read_filter_row_mmx_sub(row_info, row);
4469
png_uint_32 istop = row_info->rowbytes;
4470
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4471
png_bytep rp = row + bpp;
4474
for (i = bpp; i < istop; i++)
4476
*rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
4482
case PNG_FILTER_VALUE_UP:
4484
if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
4486
png_read_filter_row_mmx_up(row_info, row, prev_row);
4494
for (i = 0, rp = row, pp = prev_row;
4495
i < row_info->rowbytes; i++, rp++, pp++)
4497
*rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
4502
case PNG_FILTER_VALUE_AVG:
4504
if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
4506
png_read_filter_row_mmx_avg(row_info, row, prev_row);
4513
png_bytep pp = prev_row;
4515
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4516
png_uint_32 istop = row_info->rowbytes - bpp;
4518
for (i = 0; i < bpp; i++)
4520
*rp = (png_byte)(((int)(*rp) +
4521
((int)(*pp++) >> 1)) & 0xff);
4525
for (i = 0; i < istop; i++)
4527
*rp = (png_byte)(((int)(*rp) +
4528
((int)(*pp++ + *lp++) >> 1)) & 0xff);
4534
case PNG_FILTER_VALUE_PAETH:
4536
if ((row_info->pixel_depth > 8) && (row_info->rowbytes >= 128))
4538
png_read_filter_row_mmx_paeth(row_info, row, prev_row);
4545
png_bytep pp = prev_row;
4547
png_bytep cp = prev_row;
4548
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4549
png_uint_32 istop=row_info->rowbytes - bpp;
4551
for (i = 0; i < bpp; i++)
4553
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4557
for (i = 0; i < istop; i++) // use leftover rp,pp
4559
int a, b, c, pa, pb, pc, p;
4573
pa = p < 0 ? -p : p;
4574
pb = pc < 0 ? -pc : pc;
4575
pc = (p + pc) < 0 ? -(p + pc) : p + pc;
4579
if (pa <= pb && pa <= pc)
4587
p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
4589
*rp = (png_byte)(((int)(*rp) + p) & 0xff);
4596
png_warning(png_ptr, "Ignoring bad adaptive filter type");
4602
#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
4605
// GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
4606
// (2) all instructions compile with gcc 2.7.2.3 and later
4607
// (3) the function is moved down here to prevent gcc from
4608
// inlining it in multiple places and then barfing be-
4609
// cause the ".NOT_SUPPORTED" label is multiply defined
4610
// [is there a way to signal that a *single* function should
4611
// not be inlined? is there a way to modify the label for
4612
// each inlined instance, e.g., by appending _1, _2, etc.?
4613
// maybe if don't use leading "." in label name? (not tested)]
4615
#ifdef ORIG_THAT_USED_TO_CLOBBER_EBX
4617
int mmxsupport(void)
4619
int mmx_supported_local = 0;
4622
// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
4623
// "pushf \n\t" // save Eflag to stack
4624
"pushfl \n\t" // save Eflag to stack
4625
"popl %%eax \n\t" // get Eflag from stack into eax
4626
"movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
4627
"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
4628
"pushl %%eax \n\t" // save modified Eflag back to stack
4629
// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
4630
// "popf \n\t" // restore modified value to Eflag reg
4631
"popfl \n\t" // restore modified value to Eflag reg
4632
"pushfl \n\t" // save Eflag to stack
4633
"popl %%eax \n\t" // get Eflag from stack
4634
"xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
4635
"jz .NOT_SUPPORTED \n\t" // if same, CPUID instr. is not supported
4637
"xorl %%eax, %%eax \n\t" // set eax to zero
4638
// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
4639
"cpuid \n\t" // get the CPU identification info
4640
"cmpl $1, %%eax \n\t" // make sure eax return non-zero value
4641
"jl .NOT_SUPPORTED \n\t" // if eax is zero, MMX is not supported
4643
"xorl %%eax, %%eax \n\t" // set eax to zero and...
4644
"incl %%eax \n\t" // ...increment eax to 1. This pair is
4645
// faster than the instruction "mov eax, 1"
4646
"cpuid \n\t" // get the CPU identification info again
4647
"andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
4648
"cmpl $0, %%edx \n\t" // 0 = MMX not supported
4649
"jz .NOT_SUPPORTED \n\t" // non-zero = yes, MMX IS supported
4651
"movl $1, %0 \n\t" // set return value to 1 and fall through
4653
".NOT_SUPPORTED: \n\t" // target label for jump instructions
4654
"movl %0, %%eax \n\t" // move return value to eax
4657
: "=m" (mmx_supported_local) // %0 (output list: memory only)
4659
: // any variables used on input (none)
4661
: "%eax", "%ebx", // clobber list
4663
// , "memory" // if write to a variable gcc thought was in a reg
4664
// , "cc" // "condition codes" (flag bits)
4667
//mmx_supported_local=0; // test code for force don't support MMX
4668
//printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
4670
return mmx_supported_local;
4673
#else /* !ORIG_THAT_USED_TO_CLOBBER_EBX */
4675
int mmxsupport(void)
4678
"pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
4679
"pushl %%ecx \n\t" // so does ecx...
4680
"pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
4681
// ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
4682
// "pushf \n\t" // save Eflag to stack
4683
"pushfl \n\t" // save Eflag to stack
4684
"popl %%eax \n\t" // get Eflag from stack into eax
4685
"movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
4686
"xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
4687
"pushl %%eax \n\t" // save modified Eflag back to stack
4688
// ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
4689
// "popf \n\t" // restore modified value to Eflag reg
4690
"popfl \n\t" // restore modified value to Eflag reg
4691
"pushfl \n\t" // save Eflag to stack
4692
"popl %%eax \n\t" // get Eflag from stack
4693
"xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
4694
"jz .NOT_SUPPORTED \n\t" // if same, CPUID instr. is not supported
4696
"xorl %%eax, %%eax \n\t" // set eax to zero
4697
// ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
4698
"cpuid \n\t" // get the CPU identification info
4699
"cmpl $1, %%eax \n\t" // make sure eax return non-zero value
4700
"jl .NOT_SUPPORTED \n\t" // if eax is zero, MMX is not supported
4702
"xorl %%eax, %%eax \n\t" // set eax to zero and...
4703
"incl %%eax \n\t" // ...increment eax to 1. This pair is
4704
// faster than the instruction "mov eax, 1"
4705
"cpuid \n\t" // get the CPU identification info again
4706
"andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
4707
"cmpl $0, %%edx \n\t" // 0 = MMX not supported
4708
"jz .NOT_SUPPORTED \n\t" // non-zero = yes, MMX IS supported
4710
"movl $1, %%eax \n\t" // set return value to 1
4711
"popl %%edx \n\t" // restore edx
4712
"popl %%ecx \n\t" // restore ecx
4713
"popl %%ebx \n\t" // restore ebx ("row" in png_do_interlace)
4714
"ret \n\t" // DONE: have MMX support
4716
".NOT_SUPPORTED: \n\t" // target label for jump instructions
4717
"movl $0, %%eax \n\t" // set return value to 0
4718
"popl %%edx \n\t" // restore edx
4719
"popl %%ecx \n\t" // restore ecx
4720
"popl %%ebx \n\t" // restore ebx ("row" in png_do_interlace)
4721
// "ret \n\t" // DONE: no MMX support
4722
// (fall through to standard C "ret")
4724
: // "=m" (mmx_supported_local) // %0 (output list: memory only)
4726
: // any variables used on input (none)
4728
: "%eax" // clobber list
4729
// , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
4730
// , "memory" // if write to a variable gcc thought was in a reg
4731
// , "cc" // "condition codes" (flag bits)
4734
//mmx_supported_local=0; // test code for force don't support MMX
4735
//printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
4737
//return mmx_supported_local;
4740
#endif /* ?ORIG_THAT_USED_TO_CLOBBER_EBX */
4742
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */