1
/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3
* For Intel x86 CPU and Microsoft Visual C++ compiler
5
* libpng 1.0.8 - July 24, 2000
6
* For conditions of distribution and use, see copyright notice in png.h
7
* Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
8
* Copyright (c) 1998, Intel Corporation
10
* Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11
* Interface to libpng contributed by Gilles Vollant, 1999
18
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
21
One of these might need to be defined.
22
#define DISABLE_PNGVCRD_COMBINE
23
#define DISABLE_PNGVCRD_INTERLACE
26
// Cached result of the MMX probe. The initial value 2 serves as "not probed
// yet": code in this file tests (mmx_supported == 2) and then stores the
// return value of mmxsupport() here.
static int mmx_supported=2;
29
// Forward declaration of the C filter routine.
// NOTE(review): its return-type line is not visible in this excerpt.
png_read_filter_row_c(png_structp png_ptr, png_row_infop row_info,
30
png_bytep row, png_bytep prev_row, int filter);
32
// Probe the CPU for MMX support with inline assembly.
// Visible steps: toggle the ID bit (0x200000) in EFLAGS to find out whether
// CPUID is available, execute CPUID with eax = 0 and require a result >= 1,
// then execute CPUID with eax = 1 and test edx against 0x00800000.
// Returns mmx_supported_local: 1 when that bit is set, otherwise its initial
// value 0.
static int mmxsupport()
34
int mmx_supported_local = 0;
36
push ebx //CPUID will trash these
39
pushfd //Save Eflag to stack
40
pop eax //Get Eflag from stack into eax
41
mov ecx, eax //Make another copy of Eflag in ecx
42
xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
43
push eax //Save modified Eflag back to stack
45
popfd //Restored modified value back to Eflag reg
46
pushfd //Save Eflag to stack
47
pop eax //Get Eflag from stack
48
xor eax, ecx //Compare the new Eflag with the original Eflag
49
jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
50
//skip following instructions and jump to
53
xor eax, eax //Set eax to zero
55
_asm _emit 0x0f //CPUID instruction (two bytes opcode)
58
cmp eax, 1 //make sure eax return non-zero value
59
jl NOT_SUPPORTED //If eax is zero, mmx not supported
61
xor eax, eax //set eax to zero
62
inc eax //Now increment eax to 1. This instruction is
63
//faster than the instruction "mov eax, 1"
65
_asm _emit 0x0f //CPUID instruction
68
and edx, 0x00800000 //mask out all bits but mmx bit(23)
69
cmp edx, 0 // 0 = mmx not supported
70
jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
72
mov mmx_supported_local, 1 //set return value to 1
75
mov eax, mmx_supported_local //move return value to eax
76
pop edx //CPUID trashed these
81
//mmx_supported_local=0; // test code for force don't support MMX
82
//printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
84
return mmx_supported_local;
87
/* Combines the row recently read in with the previous row.
88
This routine takes care of alpha and transparency if requested.
89
This routine also handles the two methods of progressive display
90
of interlaced images, depending on the mask value.
91
The mask value describes which pixels are to be combined with
92
the row. The pattern always repeats every 8 pixels, so just 8
93
bits are needed. A one indicates the pixel is to be combined; a
94
zero indicates the pixel is to be skipped. This is in addition
95
to any alpha or transparency value associated with the pixel. If
96
you want all pixels to be combined, pass 0xff (255) in mask. */
98
/* Use this routine for x86 platform - uses faster MMX routine if machine
102
// Merge the freshly read row (png_ptr->row_buf + 1) into `row`. `mask`
// selects which pixels of every group of eight are taken, as described in
// the block comment preceding this function.
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
104
#ifdef PNG_USE_LOCAL_ARRAYS
105
// Per-pass pixel increment, indexed by png_ptr->pass; used further down as
// the stride of the non-MMX copy loops.
const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
107
#ifdef DISABLE_PNGVCRD_COMBINE
108
// Remember the global flag; it is written back at the end of the function.
int save_mmx_supported = mmx_supported;
111
png_debug(1,"in png_combine_row_asm\n");
113
#ifdef DISABLE_PNGVCRD_COMBINE
114
// NOTE(review): the statement controlled by this test is not visible here.
if ((png_ptr->transformations & PNG_INTERLACE) && png_ptr->pass != 6)
118
// First use: probe the CPU once and cache the answer in mmx_supported.
if (mmx_supported == 2)
119
mmx_supported = mmxsupport();
123
// Whole-row copy of ceil(width * pixel_depth / 8) bytes.
// NOTE(review): the condition guarding this copy is not visible here.
png_memcpy(row, png_ptr->row_buf + 1,
124
(png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
126
/* GRR: add "else if (mask == 0)" case?
127
* or does png_combine_row() not even get called in that case? */
130
// Everything below is selected by the pixel depth of the current row.
switch (png_ptr->row_info.pixel_depth)
136
// Packed pixels narrower than a byte. The three loops below use value masks
// 0x1, 0x3 and 0xf (1, 2 and 4 bits per pixel). Each walks png_ptr->width
// pixels, reads a pixel from *sp at `shift`, clears the same bit field in
// *dp and ORs the new value in. PNG_PACKSWAP changes the shift setup.
// NOTE(review): the case labels, the shift start/end/step assignments and
// the per-pixel mask test are not visible in this excerpt.
int s_inc, s_start, s_end;
141
sp = png_ptr->row_buf + 1;
144
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
145
if (png_ptr->transformations & PNG_PACKSWAP)
161
for (i = 0; i < png_ptr->width; i++)
167
value = (*sp >> shift) & 0x1;
168
*dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
169
*dp |= (png_byte)(value << shift);
193
// 2 bits per pixel.
int s_start, s_end, s_inc;
199
sp = png_ptr->row_buf + 1;
202
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
203
if (png_ptr->transformations & PNG_PACKSWAP)
219
for (i = 0; i < png_ptr->width; i++)
223
value = (*sp >> shift) & 0x3;
224
*dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
225
*dp |= (png_byte)(value << shift);
248
// 4 bits per pixel.
int s_start, s_end, s_inc;
254
sp = png_ptr->row_buf + 1;
257
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
258
if (png_ptr->transformations & PNG_PACKSWAP)
273
for (i = 0; i < png_ptr->width; i++)
277
value = (*sp >> shift) & 0xf;
278
*dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
279
*dp |= (png_byte)(value << shift);
306
// One byte per pixel. mask0 carries a different selector bit in each byte
// lane (0x80 in the lowest byte up to 0x01 in the highest). ANDing it with
// the replicated mask byte and comparing against zero produces a per-byte
// keep/replace pattern. `len` pixels (a multiple of 8) go through the MMX
// loop; the `diff` leftover pixels are steered by shifting mask bits into
// the carry flag.
__int64 mask0=0x0102040810204080;
310
srcptr = png_ptr->row_buf + 1;
314
len = png_ptr->width &~7; //reduce to multiple of 8
315
diff = png_ptr->width & 7; //amount lost
319
movd mm7, unmask //load bit pattern
320
psubb mm6,mm6 //zero mm6
323
punpckldq mm7,mm7 //fill register with 8 masks
327
pand mm0,mm7 //nonzero if keep byte
328
pcmpeqb mm0,mm6 //zeros->1s, v versa
330
mov ecx,len //load length of line (pixels)
331
mov esi,srcptr //load source
332
mov ebx,dstptr //load dest
344
add esi,8 //inc by 8 bytes processed
346
sub ecx,8 //dec by 8 pixels processed
356
sal edx,24 //make low byte the high byte
359
sal edx,1 //move high bit to CF
360
jnc skip8 //if CF = 0
373
// Fallback without MMX: start at pixel offset_table[pass] and copy one pixel
// per iteration, advancing the byte index by png_pass_inc[pass] pixels.
else /* mmx not supported - use modified C routine */
375
register unsigned int incr1, initial_val, final_val;
376
png_size_t pixel_bytes;
378
register int disp = png_pass_inc[png_ptr->pass];
379
int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
381
pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
382
srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
384
dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
385
initial_val = offset_table[png_ptr->pass]*pixel_bytes;
386
final_val = png_ptr->width*pixel_bytes;
387
incr1 = (disp)*pixel_bytes;
388
for (i = initial_val; i < final_val; i += incr1)
390
png_memcpy(dstptr, srcptr, pixel_bytes);
405
// Two bytes per pixel. mask0/mask1 repeat each selector bit over two
// adjacent byte lanes, so the two quadwords together cover 8 pixels
// (16 bytes) per loop iteration.
__int64 mask1=0x0101020204040808,
406
mask0=0x1010202040408080;
410
srcptr = png_ptr->row_buf + 1;
414
// len: pixels handled by the MMX loop (multiple of 8); diff: leftover pixels.
len = (png_ptr->width)&~7;
415
diff = (png_ptr->width)&7;
418
movd mm7, unmask //load bit pattern
419
psubb mm6,mm6 //zero mm6
422
punpckldq mm7,mm7 //fill register with 8 masks
433
mov ecx,len //load length of line
434
mov esi,srcptr //load source
435
mov ebx,dstptr //load dest
456
add esi,16 //inc by 16 bytes processed
458
sub ecx,8 //dec by 8 pixels processed
468
sal edx,24 //make low byte the high byte
470
sal edx,1 //move high bit to CF
471
jnc skip16 //if CF = 0
484
// Fallback without MMX: strided per-pixel copy starting at offset_table[pass].
else /* mmx not supported - use modified C routine */
486
register unsigned int incr1, initial_val, final_val;
487
png_size_t pixel_bytes;
489
register int disp = png_pass_inc[png_ptr->pass];
490
int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
492
pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
493
srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
495
dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
496
initial_val = offset_table[png_ptr->pass]*pixel_bytes;
497
final_val = png_ptr->width*pixel_bytes;
498
incr1 = (disp)*pixel_bytes;
499
for (i = initial_val; i < final_val; i += incr1)
501
png_memcpy(dstptr, srcptr, pixel_bytes);
517
// Three bytes per pixel. mask0..mask2 repeat each selector bit over three
// adjacent byte lanes; the three quadwords together cover 8 pixels
// (24 bytes) per loop iteration.
__int64 mask2=0x0101010202020404, //24bpp
518
mask1=0x0408080810101020,
519
mask0=0x2020404040808080;
521
srcptr = png_ptr->row_buf + 1;
525
// len: pixels handled by the MMX loop (multiple of 8); diff: leftover pixels.
len = (png_ptr->width)&~7;
526
diff = (png_ptr->width)&7;
532
movd mm7, unmask //load bit pattern
533
psubb mm6,mm6 //zero mm6
536
punpckldq mm7,mm7 //fill register with 8 masks
550
mov ecx,len //load length of line
551
mov esi,srcptr //load source
552
mov ebx,dstptr //load dest
582
add esi,24 //inc by 24 bytes processed
584
sub ecx,8 //dec by 8 pixels processed
594
sal edx,24 //make low byte the high byte
596
sal edx,1 //move high bit to CF
597
jnc skip24 //if CF = 0
614
// Fallback without MMX: strided per-pixel copy starting at offset_table[pass].
else /* mmx not supported - use modified C routine */
616
register unsigned int incr1, initial_val, final_val;
617
png_size_t pixel_bytes;
619
register int disp = png_pass_inc[png_ptr->pass];
620
int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
622
pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
623
srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
625
dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
626
initial_val = offset_table[png_ptr->pass]*pixel_bytes;
627
final_val = png_ptr->width*pixel_bytes;
628
incr1 = (disp)*pixel_bytes;
629
for (i = initial_val; i < final_val; i += incr1)
631
png_memcpy(dstptr, srcptr, pixel_bytes);
647
// Four bytes per pixel. mask0..mask3 repeat each selector bit over four
// adjacent byte lanes; the four quadwords together cover 8 pixels
// (32 bytes) per loop iteration.
__int64 mask3=0x0101010102020202, //32bpp
648
mask2=0x0404040408080808,
649
mask1=0x1010101020202020,
650
mask0=0x4040404080808080;
652
srcptr = png_ptr->row_buf + 1;
656
// len: pixels handled by the MMX loop (multiple of 8); diff: leftover pixels.
len = (png_ptr->width)&~7;
657
diff = (png_ptr->width)&7;
663
movd mm7, unmask //load bit pattern
664
psubb mm6,mm6 //zero mm6
667
punpckldq mm7,mm7 //fill register with 8 masks
684
mov ecx,len //load length of line
685
mov esi,srcptr //load source
686
mov ebx,dstptr //load dest
724
add esi,32 //inc by 32 bytes processed
726
sub ecx,8 //dec by 8 pixels processed
736
sal edx,24 //make low byte the high byte
738
sal edx,1 //move high bit to CF
739
jnc skip32 //if CF = 0
753
// Fallback without MMX: strided per-pixel copy starting at offset_table[pass].
else /* mmx _not supported - Use modified C routine */
755
register unsigned int incr1, initial_val, final_val;
756
png_size_t pixel_bytes;
758
register int disp = png_pass_inc[png_ptr->pass];
759
int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
761
pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
762
srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
764
dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
765
initial_val = offset_table[png_ptr->pass]*pixel_bytes;
766
final_val = png_ptr->width*pixel_bytes;
767
incr1 = (disp)*pixel_bytes;
768
for (i = initial_val; i < final_val; i += incr1)
770
png_memcpy(dstptr, srcptr, pixel_bytes);
786
// Six bytes per pixel. mask0..mask5 repeat each selector bit over six
// adjacent byte lanes; the six quadwords together cover 8 pixels
// (48 bytes) per loop iteration.
__int64 mask5=0x0101010101010202,
787
mask4=0x0202020204040404,
788
mask3=0x0404080808080808,
789
mask2=0x1010101010102020,
790
mask1=0x2020202040404040,
791
mask0=0x4040808080808080;
795
srcptr = png_ptr->row_buf + 1;
799
// len: pixels handled by the MMX loop (multiple of 8); diff: leftover pixels.
len = (png_ptr->width)&~7;
800
diff = (png_ptr->width)&7;
803
movd mm7, unmask //load bit pattern
804
psubb mm6,mm6 //zero mm6
807
punpckldq mm7,mm7 //fill register with 8 masks
830
mov ecx,len //load length of line
831
mov esi,srcptr //load source
832
mov ebx,dstptr //load dest
880
add esi,48 //inc by 48 bytes processed
882
sub ecx,8 //dec by 8 pixels processed
892
sal edx,24 //make low byte the high byte
895
sal edx,1 //move high bit to CF
896
jnc skip48 //if CF = 0
910
// Fallback without MMX: strided per-pixel copy starting at offset_table[pass].
else /* mmx _not supported - Use modified C routine */
912
register unsigned int incr1, initial_val, final_val;
913
png_size_t pixel_bytes;
915
register int disp = png_pass_inc[png_ptr->pass];
916
int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
918
pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
919
srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
921
dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
922
initial_val = offset_table[png_ptr->pass]*pixel_bytes;
923
final_val = png_ptr->width*pixel_bytes;
924
incr1 = (disp)*pixel_bytes;
925
for (i = initial_val; i < final_val; i += incr1)
927
png_memcpy(dstptr, srcptr, pixel_bytes);
940
// Remaining pixel depths: plain C only. Same strided copy as the non-MMX
// branches above, using sptr/dp instead of srcptr/dstptr.
// NOTE(review): the pointer updates inside the loop are not visible here.
png_size_t pixel_bytes;
941
int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
943
register int disp = png_pass_inc[png_ptr->pass]; // get the offset
944
register unsigned int incr1, initial_val, final_val;
946
pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
947
sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
949
dp = row + offset_table[png_ptr->pass]*pixel_bytes;
950
initial_val = offset_table[png_ptr->pass]*pixel_bytes;
951
final_val = png_ptr->width*pixel_bytes;
952
incr1 = (disp)*pixel_bytes;
953
for (i = initial_val; i < final_val; i += incr1)
955
png_memcpy(dp, sptr, pixel_bytes);
961
} /* end switch (png_ptr->row_info.pixel_depth) */
962
} /* end if (non-trivial mask) */
964
#ifdef DISABLE_PNGVCRD_COMBINE
965
// Put back the flag value that was saved on entry.
mmx_supported = save_mmx_supported;
968
} /* end png_combine_row() */
971
#if defined(PNG_READ_INTERLACING_SUPPORTED)
974
// Widen one interlaced row in place: the row grows from row_info->width
// pixels to width * png_pass_inc[pass] pixels, each source pixel being
// written png_pass_inc[pass] times by the loops below.
png_do_read_interlace(png_row_infop row_info, png_bytep row, int pass,
975
png_uint_32 transformations)
977
#ifdef PNG_USE_LOCAL_ARRAYS
978
// Replication factor per interlace pass (index = pass).
const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
980
#ifdef DISABLE_PNGVCRD_INTERLACE
981
// Remember the global flag; it is written back at the end of the function.
int save_mmx_supported = mmx_supported;
984
png_debug(1,"in png_do_read_interlace\n");
986
#ifdef DISABLE_PNGVCRD_INTERLACE
987
/* In libpng versions 1.0.3a through 1.0.4d,
988
* a sign error in the post-MMX cleanup code for each pixel_depth resulted
989
* in bad pixels at the beginning of some rows of some images, and also
990
* (due to out-of-range memory reads and writes) caused heap corruption
991
* when compiled with MSVC 6.0. The error was fixed in version 1.0.4e,
992
* and the code appears to work completely correctly, so it is enabled
995
if (1) /* all passes caused a heap problem in the old code */
999
// First use: probe the CPU once and cache the answer in mmx_supported.
if (mmx_supported == 2)
1000
mmx_supported = mmxsupport();
1002
if (row != NULL && row_info != NULL)
1004
png_uint_32 final_width;
1006
// Width of the row after expansion.
final_width = row_info->width * png_pass_inc[pass];
1008
switch (row_info->pixel_depth)
1014
// Packed pixels narrower than a byte (value masks 0x1, 0x3, 0xf below).
// sp/dp start at the byte holding the LAST source/destination pixel and the
// outer loops count i down from width, so expansion runs back to front.
// Each pixel value v is written png_pass_inc[pass] times.
// NOTE(review): case labels, s_start/s_end/s_inc setup and the pointer/shift
// stepping inside the `== s_end` tests are not visible in this excerpt.
int s_start, s_end, s_inc;
1019
sp = row + (png_size_t)((row_info->width - 1) >> 3);
1020
dp = row + (png_size_t)((final_width - 1) >> 3);
1021
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1022
if (transformations & PNG_PACKSWAP)
1024
sshift = (int)((row_info->width + 7) & 7);
1025
dshift = (int)((final_width + 7) & 7);
1033
sshift = 7 - (int)((row_info->width + 7) & 7);
1034
dshift = 7 - (int)((final_width + 7) & 7);
1040
for (i = row_info->width; i; i--)
1042
v = (png_byte)((*sp >> sshift) & 0x1);
1043
for (j = 0; j < png_pass_inc[pass]; j++)
1045
*dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1046
*dp |= (png_byte)(v << dshift);
1047
if (dshift == s_end)
1055
if (sshift == s_end)
1070
// 2 bits per pixel: four pixels per byte, shifts are multiples of 2.
int s_start, s_end, s_inc;
1073
sp = row + (png_size_t)((row_info->width - 1) >> 2);
1074
dp = row + (png_size_t)((final_width - 1) >> 2);
1075
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1076
if (transformations & PNG_PACKSWAP)
1078
sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1079
dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1087
sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1088
dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1094
for (i = row_info->width; i; i--)
1099
v = (png_byte)((*sp >> sshift) & 0x3);
1100
for (j = 0; j < png_pass_inc[pass]; j++)
1102
*dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1103
*dp |= (png_byte)(v << dshift);
1104
if (dshift == s_end)
1112
if (sshift == s_end)
1127
// 4 bits per pixel: two pixels per byte, shifts are 0 or 4.
int s_start, s_end, s_inc;
1130
sp = row + (png_size_t)((row_info->width - 1) >> 1);
1131
dp = row + (png_size_t)((final_width - 1) >> 1);
1132
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1133
if (transformations & PNG_PACKSWAP)
1135
sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1136
dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1144
sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1145
dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1151
for (i = row_info->width; i; i--)
1156
v = (png_byte)((*sp >> sshift) & 0xf);
1157
for (j = 0; j < png_pass_inc[pass]; j++)
1159
*dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1160
*dp |= (png_byte)(v << dshift);
1161
if (dshift == s_end)
1169
if (sshift == s_end)
1180
// Pixel depths of one byte or more. const4 keeps the low three bytes of a
// quadword and const6 keeps the low byte; both are used by the 3-byte MMX
// code below. sptr and dp are set to the LAST source and destination pixel,
// so the expansion proceeds from the end of the row towards its start.
default: // This is the place where the routine is modified
1182
__int64 const4 = 0x0000000000FFFFFF;
1183
// __int64 const5 = 0x000000FFFFFF0000; // unused...
1184
__int64 const6 = 0x00000000000000FF;
1187
png_size_t pixel_bytes;
1188
int width = row_info->width;
1190
pixel_bytes = (row_info->pixel_depth >> 3);
1192
sptr = row + (width - 1) * pixel_bytes;
1193
dp = row + (final_width - 1) * pixel_bytes;
1194
// New code by Nirav Chhatrapati - Intel Corporation
1196
// NOTE: there is NO MMX code for 48-bit and 64-bit images
1198
if (mmx_supported) // use MMX routine if machine supports it
1200
// Three bytes per pixel, MMX path. In the first branch edi is moved back by
// 21 = (8 - 1) * 3 bytes and in the second by 9 = (4 - 1) * 3 bytes, matching
// the replication factors of passes 0/1 and 2/3. The last branch handles an
// even pixel count (width_mmx) with MMX and leaves the rest to the C loop at
// the end.
// NOTE(review): loop labels, stores and pointer stepping of the asm blocks
// are only partly visible in this excerpt.
if (pixel_bytes == 3)
1202
if (((pass == 0) || (pass == 1)) && width)
1209
sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1211
movd mm0, [esi] ; X X X X X v2 v1 v0
1212
pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1213
movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1214
psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1215
movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1216
psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1217
psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1218
por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1219
por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1220
movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1221
psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1222
movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1223
punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1225
psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1227
punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1237
else if (((pass == 2) || (pass == 3)) && width)
1244
sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1246
movd mm0, [esi] ; X X X X X v2 v1 v0
1247
pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1248
movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1249
psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1250
movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1251
psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1252
psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1253
por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1254
por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1255
movq [edi+4], mm0 ; move to memory
1256
psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1257
movd [edi], mm0 ; move to memory
1265
else if (width) /* && ((pass == 4) || (pass == 5)) */
1267
// Even pixel count minus 8: the final pixels are always left to the C loop.
// NOTE(review): any guard against a negative width_mmx is not visible here.
int width_mmx = ((width >> 1) << 1) - 8;
1270
width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1281
movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1282
movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1283
movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1284
psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1285
pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1286
psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1287
por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1288
movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1289
psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1290
movq [edi], mm0 ; move quad to memory
1291
psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1292
pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1293
por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1294
movd [edi+8], mm6 ; move double to memory
1303
// Step the source pointer back over the pixels the MMX code consumed, then
// finish the remaining `width` pixels in C.
sptr -= width_mmx*3;
1305
for (i = width; i; i--)
1310
png_memcpy(v, sptr, 3);
1311
for (j = 0; j < png_pass_inc[pass]; j++)
1313
png_memcpy(dp, v, 3);
1319
} /* end of pixel_bytes == 3 */
1321
// One byte per pixel, MMX path. width_mmx is width rounded down to a
// multiple of 4 (first two branches) or 8 (last branch). According to the
// lane comments, each source byte is spread over 8, 4 or 2 destination bytes
// with punpck* instructions; a C loop follows each asm block.
// NOTE(review): the bodies of those C loops are mostly not visible here.
else if (pixel_bytes == 1)
1323
if (((pass == 0) || (pass == 1)) && width)
1325
int width_mmx = ((width >> 2) << 2);
1337
movd mm0, [esi] ; X X X X v0 v1 v2 v3
1338
movq mm1, mm0 ; X X X X v0 v1 v2 v3
1339
punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1340
movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1341
punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1342
movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1343
punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1344
punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1345
movq [edi], mm0 ; move to memory v3
1346
punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1347
movq [edi+8], mm3 ; move to memory v2
1348
movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1349
punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1350
punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1351
movq [edi+16], mm2 ; move to memory v1
1352
movq [edi+24], mm4 ; move to memory v0
1363
for (i = width; i; i--)
1367
/* I simplified this part in version 1.0.4e
1368
* here and in several other instances where
1369
* pixel_bytes == 1 -- GR-P
1374
* png_memcpy(v, sptr, pixel_bytes);
1375
* for (j = 0; j < png_pass_inc[pass]; j++)
1377
* png_memcpy(dp, v, pixel_bytes);
1378
* dp -= pixel_bytes;
1380
* sptr -= pixel_bytes;
1382
* Replacement code is in the next three lines:
1385
for (j = 0; j < png_pass_inc[pass]; j++)
1390
else if (((pass == 2) || (pass == 3)) && width)
1392
int width_mmx = ((width >> 2) << 2);
1404
movd mm0, [esi] ; X X X X v0 v1 v2 v3
1405
punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1406
movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1407
punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1408
punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1409
movq [edi], mm0 ; move to memory v2 and v3
1411
movq [edi+8], mm1 ; move to memory v1 and v0
1421
for (i = width; i; i--)
1425
for (j = 0; j < png_pass_inc[pass]; j++)
1432
else if (width) /* && ((pass == 4) || (pass == 5))) */
1434
int width_mmx = ((width >> 3) << 3);
1446
movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1447
movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1448
punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1449
//movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1450
punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1451
movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1453
movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1464
for (i = width; i; i--)
1468
for (j = 0; j < png_pass_inc[pass]; j++)
1475
} /* end of pixel_bytes == 1 */
1477
// Two bytes per pixel, MMX path. width_mmx is width rounded down to an even
// number. After each asm block, sptr and dp are moved back by the bytes the
// MMX code covered (source: width_mmx*2; destination: width_mmx*16, *8 or *4
// for the three branches, i.e. 2 bytes times the replication factor), and a
// C loop then copies pixels 2 bytes at a time.
// NOTE(review): the count left in `width` for those C loops is set on lines
// not visible in this excerpt.
else if (pixel_bytes == 2)
1479
if (((pass == 0) || (pass == 1)) && width)
1481
int width_mmx = ((width >> 1) << 1);
1493
movd mm0, [esi] ; X X X X v1 v0 v3 v2
1494
punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1495
movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1496
punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1497
punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1500
movq [edi + 16], mm1
1501
movq [edi + 24], mm1
1510
sptr -= (width_mmx*2 - 2); // sign fixed
1511
dp -= (width_mmx*16 - 2); // sign fixed
1512
for (i = width; i; i--)
1517
png_memcpy(v, sptr, 2);
1518
for (j = 0; j < png_pass_inc[pass]; j++)
1521
png_memcpy(dp, v, 2);
1525
else if (((pass == 2) || (pass == 3)) && width)
1527
int width_mmx = ((width >> 1) << 1) ;
1539
movd mm0, [esi] ; X X X X v1 v0 v3 v2
1540
punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1541
movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1542
punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1543
punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1555
sptr -= (width_mmx*2 - 2); // sign fixed
1556
dp -= (width_mmx*8 - 2); // sign fixed
1557
for (i = width; i; i--)
1562
png_memcpy(v, sptr, 2);
1563
for (j = 0; j < png_pass_inc[pass]; j++)
1566
png_memcpy(dp, v, 2);
1570
else if (width) // pass == 4 or 5
1572
int width_mmx = ((width >> 1) << 1) ;
1584
movd mm0, [esi] ; X X X X v1 v0 v3 v2
1585
punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1595
sptr -= (width_mmx*2 - 2); // sign fixed
1596
dp -= (width_mmx*4 - 2); // sign fixed
1597
for (i = width; i; i--)
1602
png_memcpy(v, sptr, 2);
1603
for (j = 0; j < png_pass_inc[pass]; j++)
1606
png_memcpy(dp, v, 2);
1610
} /* end of pixel_bytes == 2 */
1612
// Four bytes per pixel, MMX path. width_mmx is width rounded down to an even
// number; one movq loads two pixels, and punpckldq/punpckhdq duplicate each
// into a full quadword. After each asm block, sptr and dp are moved back by
// the bytes the MMX code covered (source: width_mmx*4; destination:
// width_mmx*32, *16 or *8), and a C loop then copies pixels 4 bytes at a
// time.
// NOTE(review): several stores and the loop control of the asm blocks are
// not visible in this excerpt.
else if (pixel_bytes == 4)
1614
if (((pass == 0) || (pass == 1)) && width)
1616
int width_mmx = ((width >> 1) << 1) ;
1628
movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1629
movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1630
punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1631
punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1634
movq [edi + 16], mm0
1635
movq [edi + 24], mm0
1637
movq [edi + 40], mm1
1640
movq [edi + 56], mm1
1648
sptr -= (width_mmx*4 - 4); // sign fixed
1649
dp -= (width_mmx*32 - 4); // sign fixed
1650
for (i = width; i; i--)
1655
png_memcpy(v, sptr, 4);
1656
for (j = 0; j < png_pass_inc[pass]; j++)
1659
png_memcpy(dp, v, 4);
1663
else if (((pass == 2) || (pass == 3)) && width)
1665
int width_mmx = ((width >> 1) << 1) ;
1677
movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1678
movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1679
punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1680
punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1684
movq [edi + 24], mm1
1693
sptr -= (width_mmx*4 - 4); // sign fixed
1694
dp -= (width_mmx*16 - 4); // sign fixed
1695
for (i = width; i; i--)
1700
png_memcpy(v, sptr, 4);
1701
for (j = 0; j < png_pass_inc[pass]; j++)
1704
png_memcpy(dp, v, 4);
1708
else if (width) // pass == 4 or 5
1710
int width_mmx = ((width >> 1) << 1) ;
1722
movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1723
movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1724
punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1725
punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1736
sptr -= (width_mmx*4 - 4); // sign fixed
1737
dp -= (width_mmx*8 - 4); // sign fixed
1738
for (i = width; i; i--)
1743
png_memcpy(v, sptr, 4);
1744
for (j = 0; j < png_pass_inc[pass]; j++)
1747
png_memcpy(dp, v, 4);
1752
} /* end of pixel_bytes == 4 */
1754
// Six bytes per pixel and any other size: no MMX code here even when MMX is
// available, only C loops. Each pixel is copied into v and then written
// png_pass_inc[pass] times.
// NOTE(review): the sptr/dp stepping statements are not visible here.
else if (pixel_bytes == 6)
1756
for (i = width; i; i--)
1760
png_memcpy(v, sptr, 6);
1761
for (j = 0; j < png_pass_inc[pass]; j++)
1763
png_memcpy(dp, v, 6);
1768
} /* end of pixel_bytes == 6 */
1772
// Any other pixel size: same loop with a run-time byte count.
for (i = width; i; i--)
1776
png_memcpy(v, sptr, pixel_bytes);
1777
for (j = 0; j < png_pass_inc[pass]; j++)
1779
png_memcpy(dp, v, pixel_bytes);
1785
} /* end of mmx_supported */
1787
// Non-MMX path. The same back-to-front replication loop is written out once
// per pixel size (1, 3, 2, 4, 6 bytes, then a generic version): copy one
// pixel into v, write it png_pass_inc[pass] times, step sptr back one pixel.
// NOTE(review): the dp stepping and the 1-byte loop body are not visible
// in this excerpt.
else /* MMX not supported: use modified C code - takes advantage
1788
* of inlining of memcpy for a constant */
1790
if (pixel_bytes == 1)
1792
for (i = width; i; i--)
1795
for (j = 0; j < png_pass_inc[pass]; j++)
1800
else if (pixel_bytes == 3)
1802
for (i = width; i; i--)
1806
png_memcpy(v, sptr, pixel_bytes);
1807
for (j = 0; j < png_pass_inc[pass]; j++)
1809
png_memcpy(dp, v, pixel_bytes);
1812
sptr -= pixel_bytes;
1815
else if (pixel_bytes == 2)
1817
for (i = width; i; i--)
1821
png_memcpy(v, sptr, pixel_bytes);
1822
for (j = 0; j < png_pass_inc[pass]; j++)
1824
png_memcpy(dp, v, pixel_bytes);
1827
sptr -= pixel_bytes;
1830
else if (pixel_bytes == 4)
1832
for (i = width; i; i--)
1836
png_memcpy(v, sptr, pixel_bytes);
1837
for (j = 0; j < png_pass_inc[pass]; j++)
1839
png_memcpy(dp, v, pixel_bytes);
1842
sptr -= pixel_bytes;
1845
else if (pixel_bytes == 6)
1847
for (i = width; i; i--)
1851
png_memcpy(v, sptr, pixel_bytes);
1852
for (j = 0; j < png_pass_inc[pass]; j++)
1854
png_memcpy(dp, v, pixel_bytes);
1857
sptr -= pixel_bytes;
1862
for (i = width; i; i--)
1866
png_memcpy(v, sptr, pixel_bytes);
1867
for (j = 0; j < png_pass_inc[pass]; j++)
1869
png_memcpy(dp, v, pixel_bytes);
1872
sptr -= pixel_bytes;
1876
} /* end of MMX not supported */
1879
} /* end switch (row_info->pixel_depth) */
1881
// Publish the widened geometry: the new pixel count and the byte length
// ceil(final_width * pixel_depth / 8).
row_info->width = final_width;
1882
row_info->rowbytes = ((final_width *
1883
(png_uint_32)row_info->pixel_depth + 7) >> 3);
1886
#ifdef DISABLE_PNGVCRD_INTERLACE
1887
// Put back the flag value that was saved on entry.
mmx_supported = save_mmx_supported;
1891
#endif /* PNG_READ_INTERLACING_SUPPORTED */
1894
// These variables are utilized in the functions below. They are declared
1895
// globally here to ensure alignment on 8-byte boundaries.
1900
// NOTE(review): the opening of this aggregate declaration is not visible in
// this excerpt; the code below accesses a member named `use`.
// LBCarryMask holds 0x01 in every byte and HBClearMask holds 0x7f in every
// byte; the filter code loads them into mm5 and mm4, the latter to clear
// bit 7 of each byte after a 64-bit right shift by one. The remaining
// objects are filled in at run time.
} LBCarryMask = {0x0101010101010101},
1901
HBClearMask = {0x7f7f7f7f7f7f7f7f},
1902
ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1905
// Optimized code for PNG Average filter decoder
1907
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1908
, png_bytep prev_row)
1911
png_uint_32 FullLength;
1912
png_uint_32 MMXLength;
1916
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1917
FullLength = row_info->rowbytes; // # of bytes to filter
1919
// Init address pointers and offset
1920
mov edi, row // edi ==> Avg(x)
1921
xor ebx, ebx // ebx ==> x
1923
mov esi, prev_row // esi ==> Prior(x)
1924
sub edx, bpp // edx ==> Raw(x-bpp)
1927
// Compute the Raw value for the first bpp bytes
1928
// Raw(x) = Avg(x) + (Prior(x)/2)
1930
mov al, [esi + ebx] // Load al with Prior(x)
1932
shr al, 1 // divide by 2
1933
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1935
mov [edi+ebx-1], al // Write back Raw(x);
1936
// mov does not affect flags; -1 to offset inc ebx
1938
// get # of bytes to alignment
1939
mov diff, edi // take start of row
1940
add diff, ebx // add bpp
1941
add diff, 0xf // add 7 + 8 to incr past alignment boundary
1942
and diff, 0xfffffff8 // mask to alignment boundary
1943
sub diff, edi // subtract from start ==> value ebx at alignment
1946
// Compute the Raw value for the bytes upto the alignment boundary
1947
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1951
mov cl, [esi + ebx] // load cl with Prior(x)
1952
mov al, [edx + ebx] // load al with Raw(x-bpp)
1955
shr ax, 1 // divide by 2
1956
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1957
cmp ebx, diff // Check if at alignment boundary
1958
mov [edi+ebx-1], al // Write back Raw(x);
1959
// mov does not affect flags; -1 to offset inc ebx
1960
jb davglp1 // Repeat until at alignment boundary
1964
sub eax, ebx // subtract alignment fix
1965
and eax, 0x00000007 // calc bytes over mult of 8
1966
sub ecx, eax // drop over bytes from original length
1969
// Now do the math for the rest of the row
1974
ActiveMask.use = 0x0000000000ffffff;
1975
ShiftBpp.use = 24; // == 3 * 8
1976
ShiftRem.use = 40; // == 64 - 24
1978
// Re-init address pointers and offset
1979
movq mm7, ActiveMask
1980
mov ebx, diff // ebx ==> x = offset to alignment boundary
1981
movq mm5, LBCarryMask
1982
mov edi, row // edi ==> Avg(x)
1983
movq mm4, HBClearMask
1984
mov esi, prev_row // esi ==> Prior(x)
1985
// PRIME the pump (load the first Raw(x-bpp) data set
1986
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1987
// (we correct position in loop below)
1989
movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1990
// Add (Prev_row/2) to Average
1992
psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1993
movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1995
pand mm3, mm1 // get lsb for each prev_row byte
1996
psrlq mm1, 1 // divide prev_row bytes by 2
1997
pand mm1, mm4 // clear invalid bit 7 of each byte
1998
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1999
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2000
movq mm1, mm3 // now use mm1 for getting LBCarrys
2001
pand mm1, mm2 // get LBCarrys for each byte where both
2002
// lsb's were == 1 (Only valid for active group)
2003
psrlq mm2, 1 // divide raw bytes by 2
2004
pand mm2, mm4 // clear invalid bit 7 of each byte
2005
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2006
pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2007
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2009
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2010
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2011
movq mm2, mm0 // mov updated Raws to mm2
2012
psllq mm2, ShiftBpp // shift data to position correctly
2013
movq mm1, mm3 // now use mm1 for getting LBCarrys
2014
pand mm1, mm2 // get LBCarrys for each byte where both
2015
// lsb's were == 1 (Only valid for active group)
2016
psrlq mm2, 1 // divide raw bytes by 2
2017
pand mm2, mm4 // clear invalid bit 7 of each byte
2018
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2019
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2020
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2023
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2024
psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2026
movq mm2, mm0 // mov updated Raws to mm2
2027
psllq mm2, ShiftBpp // shift data to position correctly
2028
// Data only needs to be shifted once here to
2029
// get the correct x-bpp offset.
2030
movq mm1, mm3 // now use mm1 for getting LBCarrys
2031
pand mm1, mm2 // get LBCarrys for each byte where both
2032
// lsb's were == 1 (Only valid for active group)
2033
psrlq mm2, 1 // divide raw bytes by 2
2034
pand mm2, mm4 // clear invalid bit 7 of each byte
2035
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2036
pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2038
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2041
// Now ready to write back to memory
2042
movq [edi + ebx - 8], mm0
2043
// Move updated Raw(x) to use as Raw(x-bpp) for next loop
2045
movq mm2, mm0 // mov updated Raw(x) to mm2
2056
ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2057
// appropriate inactive bytes
2058
ShiftBpp.use = bpp << 3;
2059
ShiftRem.use = 64 - ShiftBpp.use;
2061
movq mm4, HBClearMask
2062
// Re-init address pointers and offset
2063
mov ebx, diff // ebx ==> x = offset to alignment boundary
2064
// Load ActiveMask and clear all bytes except for 1st active group
2065
movq mm7, ActiveMask
2066
mov edi, row // edi ==> Avg(x)
2068
mov esi, prev_row // esi ==> Prior(x)
2070
movq mm5, LBCarryMask
2071
psllq mm6, ShiftBpp // Create mask for 2nd active group
2072
// PRIME the pump (load the first Raw(x-bpp) data set
2073
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2074
// (we correct position in loop below)
2076
movq mm0, [edi + ebx]
2077
psrlq mm2, ShiftRem // shift data to position correctly
2078
movq mm1, [esi + ebx]
2079
// Add (Prev_row/2) to Average
2081
pand mm3, mm1 // get lsb for each prev_row byte
2082
psrlq mm1, 1 // divide prev_row bytes by 2
2083
pand mm1, mm4 // clear invalid bit 7 of each byte
2084
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2085
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2086
movq mm1, mm3 // now use mm1 for getting LBCarrys
2087
pand mm1, mm2 // get LBCarrys for each byte where both
2088
// lsb's were == 1 (Only valid for active group)
2089
psrlq mm2, 1 // divide raw bytes by 2
2090
pand mm2, mm4 // clear invalid bit 7 of each byte
2091
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2092
pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2093
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2095
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2096
movq mm2, mm0 // mov updated Raws to mm2
2097
psllq mm2, ShiftBpp // shift data to position correctly
2099
movq mm1, mm3 // now use mm1 for getting LBCarrys
2100
pand mm1, mm2 // get LBCarrys for each byte where both
2101
// lsb's were == 1 (Only valid for active group)
2102
psrlq mm2, 1 // divide raw bytes by 2
2103
pand mm2, mm4 // clear invalid bit 7 of each byte
2104
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2105
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2106
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2109
// Now ready to write back to memory
2110
movq [edi + ebx - 8], mm0
2111
// Prep Raw(x-bpp) for next loop
2112
movq mm2, mm0 // mov updated Raws to mm2
2119
ActiveMask.use = 0x000000000000ffff;
2120
ShiftBpp.use = 24; // == 3 * 8
2121
ShiftRem.use = 40; // == 64 - 24
2124
movq mm7, ActiveMask
2125
// Re-init address pointers and offset
2126
mov ebx, diff // ebx ==> x = offset to alignment boundary
2127
movq mm5, LBCarryMask
2128
mov edi, row // edi ==> Avg(x)
2129
movq mm4, HBClearMask
2130
mov esi, prev_row // esi ==> Prior(x)
2131
// PRIME the pump (load the first Raw(x-bpp) data set
2132
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2133
// (we correct position in loop below)
2135
movq mm0, [edi + ebx]
2136
psllq mm2, ShiftRem // shift data to position correctly
2137
movq mm1, [esi + ebx]
2138
// Add (Prev_row/2) to Average
2140
pand mm3, mm1 // get lsb for each prev_row byte
2141
psrlq mm1, 1 // divide prev_row bytes by 2
2142
pand mm1, mm4 // clear invalid bit 7 of each byte
2144
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2145
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2146
movq mm1, mm3 // now use mm1 for getting LBCarrys
2147
pand mm1, mm2 // get LBCarrys for each byte where both
2148
// lsb's were == 1 (Only valid for active group)
2149
psrlq mm2, 1 // divide raw bytes by 2
2150
pand mm2, mm4 // clear invalid bit 7 of each byte
2151
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2152
pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2153
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2154
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2155
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2156
movq mm2, mm0 // mov updated Raws to mm2
2157
psllq mm2, ShiftBpp // shift data to position correctly
2158
movq mm1, mm3 // now use mm1 for getting LBCarrys
2159
pand mm1, mm2 // get LBCarrys for each byte where both
2160
// lsb's were == 1 (Only valid for active group)
2161
psrlq mm2, 1 // divide raw bytes by 2
2162
pand mm2, mm4 // clear invalid bit 7 of each byte
2163
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2164
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2165
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2167
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2168
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2169
movq mm2, mm0 // mov updated Raws to mm2
2170
psllq mm2, ShiftBpp // shift data to position correctly
2171
// Data only needs to be shifted once here to
2172
// get the correct x-bpp offset.
2173
movq mm1, mm3 // now use mm1 for getting LBCarrys
2174
pand mm1, mm2 // get LBCarrys for each byte where both
2175
// lsb's were == 1 (Only valid for active group)
2176
psrlq mm2, 1 // divide raw bytes by 2
2177
pand mm2, mm4 // clear invalid bit 7 of each byte
2178
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2179
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2180
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2182
// Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2183
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2184
movq mm2, mm0 // mov updated Raws to mm2
2185
psllq mm2, ShiftBpp // shift data to position correctly
2186
// Data only needs to be shifted once here to
2187
// get the correct x-bpp offset.
2189
movq mm1, mm3 // now use mm1 for getting LBCarrys
2190
pand mm1, mm2 // get LBCarrys for each byte where both
2191
// lsb's were == 1 (Only valid for active group)
2192
psrlq mm2, 1 // divide raw bytes by 2
2193
pand mm2, mm4 // clear invalid bit 7 of each byte
2194
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2195
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2196
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2199
// Now ready to write back to memory
2200
movq [edi + ebx - 8], mm0
2201
// Prep Raw(x-bpp) for next loop
2202
movq mm2, mm0 // mov updated Raws to mm2
2211
// Re-init address pointers and offset
2212
mov ebx, diff // ebx ==> x = offset to alignment boundary
2213
mov edi, row // edi ==> Avg(x)
2214
cmp ebx, FullLength // Test if offset at end of array
2216
// Do Avg decode for remaining bytes
2217
mov esi, prev_row // esi ==> Prior(x)
2219
xor ecx, ecx // zero ecx before using cl & cx in loop below
2220
sub edx, bpp // edx ==> Raw(x-bpp)
2222
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2224
mov cl, [esi + ebx] // load cl with Prior(x)
2225
mov al, [edx + ebx] // load al with Raw(x-bpp)
2228
shr ax, 1 // divide by 2
2229
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2230
cmp ebx, FullLength // Check if at end of array
2231
mov [edi+ebx-1], al // Write back Raw(x);
2232
// mov does not affect flags; -1 to offset inc ebx
2242
// Re-init address pointers and offset
2243
mov ebx, diff // ebx ==> x = offset to alignment boundary
2244
movq mm5, LBCarryMask
2245
mov edi, row // edi ==> Avg(x)
2246
movq mm4, HBClearMask
2247
mov esi, prev_row // esi ==> Prior(x)
2248
// PRIME the pump (load the first Raw(x-bpp) data set
2249
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2250
// (NO NEED to correct position in loop below)
2252
movq mm0, [edi + ebx]
2254
movq mm1, [esi + ebx]
2256
pand mm3, mm1 // get lsb for each prev_row byte
2257
psrlq mm1, 1 // divide prev_row bytes by 2
2258
pand mm3, mm2 // get LBCarrys for each byte where both
2260
psrlq mm2, 1 // divide raw bytes by 2
2261
pand mm1, mm4 // clear invalid bit 7 of each byte
2262
paddb mm0, mm3 // add LBCarrys to Avg for each byte
2263
pand mm2, mm4 // clear invalid bit 7 of each byte
2264
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2265
paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2267
movq [edi + ebx - 8], mm0
2268
movq mm2, mm0 // reuse as Raw(x-bpp)
2273
default: // bpp greater than 8
2276
movq mm5, LBCarryMask
2277
// Re-init address pointers and offset
2278
mov ebx, diff // ebx ==> x = offset to alignment boundary
2279
mov edi, row // edi ==> Avg(x)
2280
movq mm4, HBClearMask
2282
mov esi, prev_row // esi ==> Prior(x)
2283
sub edx, bpp // edx ==> Raw(x-bpp)
2285
movq mm0, [edi + ebx]
2287
movq mm1, [esi + ebx]
2288
pand mm3, mm1 // get lsb for each prev_row byte
2289
movq mm2, [edx + ebx]
2290
psrlq mm1, 1 // divide prev_row bytes by 2
2291
pand mm3, mm2 // get LBCarrys for each byte where both
2293
psrlq mm2, 1 // divide raw bytes by 2
2294
pand mm1, mm4 // clear invalid bit 7 of each byte
2295
paddb mm0, mm3 // add LBCarrys to Avg for each byte
2296
pand mm2, mm4 // clear invalid bit 7 of each byte
2297
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2299
paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2301
movq [edi + ebx - 8], mm0
2306
} // end switch ( bpp )
2309
// MMX acceleration complete now do clean-up
2310
// Check if any remaining bytes left to decode
2311
mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2312
mov edi, row // edi ==> Avg(x)
2313
cmp ebx, FullLength // Test if offset at end of array
2315
// Do Avg decode for remaining bytes
2316
mov esi, prev_row // esi ==> Prior(x)
2318
xor ecx, ecx // zero ecx before using cl & cx in loop below
2319
sub edx, bpp // edx ==> Raw(x-bpp)
2321
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2323
mov cl, [esi + ebx] // load cl with Prior(x)
2324
mov al, [edx + ebx] // load al with Raw(x-bpp)
2327
shr ax, 1 // divide by 2
2328
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2329
cmp ebx, FullLength // Check if at end of array
2330
mov [edi+ebx-1], al // Write back Raw(x);
2331
// mov does not affect flags; -1 to offset inc ebx
2334
emms // End MMX instructions; prep for possible FP instrs.
2338
// Optimized code for PNG Paeth filter decoder
//
// Undoes the Paeth filter on one row, in place: each filtered byte has its
// Paeth predictor added back to it. Notation used in the comments below:
// a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp), p = a + b - c.
// NOTE(review): only the first line of the parameter list is visible in this
// excerpt; the Prior() row that the asm addresses through esi presumably
// comes from a further argument -- confirm against the full declaration.
2340
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2343
png_uint_32 FullLength;
2344
png_uint_32 MMXLength;
2349
// Scratch slots used by the byte-at-a-time loops: patemp first holds pav and
// later pa; pbtemp holds pb; pctemp holds pc.
int patemp, pbtemp, pctemp;
2351
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2352
FullLength = row_info->rowbytes; // # of bytes to filter
2355
xor ebx, ebx // ebx ==> x offset
2357
xor edx, edx // edx ==> x-bpp offset
2361
// Compute the Raw value for the first bpp bytes
2362
// Note: the formula works out to be always
2363
// Raw(x) = Paeth(x) + Prior(x) where x < bpp
// (for x < bpp both a and c are taken as 0, so the predictor is b)
2369
mov [edi + ebx - 1], al
2371
// get # of bytes to alignment
// diff = ((row + bpp + 0xf) & 0xfffffff8) - row, i.e. the offset from the
// start of the row to an 8-byte boundary that lies past the first pixel.
2372
mov diff, edi // take start of row
2373
add diff, ebx // add bpp
2375
add diff, 0xf // add 7 + 8 to incr past alignment boundary
2376
and diff, 0xfffffff8 // mask to alignment boundary
2377
sub diff, edi // subtract from start ==> value ebx at alignment
2382
// Byte-at-a-time Paeth decode. Operands, as addressed below:
// a = Raw(x-bpp) at [edi + edx], b = Prior(x) at [esi + ebx],
// c = Prior(x-bpp) at [esi + edx].
// NOTE(review): presumably this loop covers the bytes up to the alignment
// offset held in diff; the loop label and its compare/branch are not visible
// in this excerpt -- confirm.
// pav = p - a = (a + b - c) - a = b - c
2383
mov al, [esi + ebx] // load Prior(x) into al
2384
mov cl, [esi + edx] // load Prior(x-bpp) into cl
2385
sub eax, ecx // subtract Prior(x-bpp)
2386
mov patemp, eax // Save pav for later use
2388
// pbv = p - b = (a + b - c) - b = a - c
2389
mov al, [edi + edx] // load Raw(x-bpp) into al
2390
sub eax, ecx // subtract Prior(x-bpp)
2392
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2393
add eax, patemp // pcv = pav + pbv
2395
// pc = abs(pcv): test the sign bit and negate when it is set
test eax, 0x80000000
2397
neg eax // reverse sign of neg values
2399
mov pctemp, eax // save pc for later use
2401
// pb = abs(pbv), same sign-bit test
// NOTE(review): the instruction that puts pbv into ecx is not visible here.
test ecx, 0x80000000
2403
neg ecx // reverse sign of neg values
2405
mov pbtemp, ecx // save pb for later use
2408
// pa = abs(pav), same sign-bit test
// NOTE(review): the instruction that reloads pav into eax is not visible here.
test eax, 0x80000000
2410
neg eax // reverse sign of neg values
2412
mov patemp, eax // save pa for later use
2416
// Predictor selection: the smallest of pa, pb, pc picks a, b or c
// respectively (ties resolved in that order); cl receives the chosen byte.
// pa > pb; now test if pb <= pc
2419
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2420
mov cl, [esi + edx] // load Prior(x-bpp) into cl
2423
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2424
mov cl, [esi + ebx] // load Prior(x) into cl
2427
// pa <= pb; now test if pa <= pc
2430
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2431
mov cl, [esi + edx] // load Prior(x-bpp) into cl
2434
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2435
mov cl, [edi + edx] // load Raw(x-bpp) into cl
2439
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
// (byte-sized add wraps mod 256; the -1 presumably compensates for ebx
// having already been incremented -- the inc is not visible here)
2440
add [edi + ebx - 1], cl
2446
// Trim the length to a whole number of 8-byte groups past the alignment
// offset (presumably the result becomes MMXLength -- the store is not
// visible in this excerpt).
sub eax, ebx // subtract alignment fix
2447
and eax, 0x00000007 // calc bytes over mult of 8
2448
sub ecx, eax // drop over bytes from original length
2451
// Now do the math for the rest of the row
2456
// MMX Paeth decode for 3 bytes per pixel (see the ShiftBpp comment below).
// Each 8-byte quadword is handled as three groups: bytes 0-2, bytes 3-5 and
// bytes 6-7. a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp); edi addresses
// the current row and esi the prior row.
ActiveMask.use = 0x0000000000ffffff; // selects bytes 0-2 of a quadword
2457
ActiveMaskEnd.use = 0xffff000000000000; // selects bytes 6-7 of a quadword
2458
ShiftBpp.use = 24; // == bpp(3) * 8
2459
ShiftRem.use = 40; // == 64 - 24
2466
// PRIME the pump (load the first Raw(x-bpp) data set)
2467
movq mm1, [edi+ebx-8]
2469
psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2470
movq mm2, [esi + ebx] // load b=Prior(x)
2471
punpcklbw mm1, mm0 // Unpack Low bytes of a
2472
movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2473
punpcklbw mm2, mm0 // Unpack Low bytes of b
2474
psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2475
// pav = p - a = (a + b - c) - a = b - c
2477
punpcklbw mm3, mm0 // Unpack Low bytes of c
2478
// pbv = p - b = (a + b - c) - b = a - c
2482
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2486
// pa = abs(p-a) = abs(pav)
2487
// pb = abs(p-b) = abs(pbv)
2488
// pc = abs(p-c) = abs(pcv)
// (pcmpgtw is a signed 16-bit compare: pav/pbv/pcv are held as words)
2489
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2491
pand mm0, mm4 // Only pav bytes < 0 in mm0
2492
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2494
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2498
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2499
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2505
pcmpgtw mm7, mm5 // pa > pb?
2507
// use mm7 mask to merge pa & pb
2509
// use mm0 mask copy to merge a & b
2515
// test ((pa <= pb)? pa:pb) <= pc
2516
pcmpgtw mm7, mm6 // pab > pc?
2523
movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2524
pand mm7, ActiveMask // keep only bytes 0-2 (1st group)
2525
movq mm2, mm3 // load b=Prior(x) step 1
2526
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2527
punpcklbw mm3, mm0 // Unpack Low bytes of c
2528
movq [edi + ebx], mm7 // write back updated value
2529
movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2530
// Now do Paeth for 2nd set of bytes (3-5)
2531
psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2532
punpcklbw mm1, mm0 // Unpack Low bytes of a
2534
punpcklbw mm2, mm0 // Unpack Low bytes of b
2535
// pbv = p - b = (a + b - c) - b = a - c
2537
// pav = p - a = (a + b - c) - a = b - c
2541
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2542
// pav + pbv = pbv + pav
2546
// pa = abs(p-a) = abs(pav)
2547
// pb = abs(p-b) = abs(pbv)
2548
// pc = abs(p-c) = abs(pcv)
2549
pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2550
pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2551
pand mm0, mm5 // Only pbv bytes < 0 in mm0
2552
pand mm7, mm4 // Only pav bytes < 0 in mm7
2558
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2559
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2564
pcmpgtw mm7, mm5 // pa > pb?
2566
// use mm7 mask to merge pa & pb
2568
// use mm0 mask copy to merge a & b
2574
// test ((pa <= pb)? pa:pb) <= pc
2575
pcmpgtw mm7, mm6 // pab > pc?
2576
movq mm2, [esi + ebx] // load b=Prior(x)
2583
movq mm3, mm2 // load c=Prior(x-bpp) step 1
2584
pand mm7, ActiveMask // keep only the 3 predictor bytes of this group
2585
punpckhbw mm2, mm0 // Unpack High bytes of b
2586
psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2587
// pav = p - a = (a + b - c) - a = b - c
2589
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2590
psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2591
movq [edi + ebx], mm7 // write back updated value
2593
punpckhbw mm3, mm0 // Unpack High bytes of c
2594
psllq mm1, ShiftBpp // Shift bytes
2595
// Now mm1 will be used as Raw(x-bpp)
2596
// Now do Paeth for 3rd, and final, set of bytes (6-7)
2598
punpckhbw mm1, mm0 // Unpack High bytes of a
2600
// pbv = p - b = (a + b - c) - b = a - c
2602
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2608
// pa = abs(p-a) = abs(pav)
2609
// pb = abs(p-b) = abs(pbv)
2610
// pc = abs(p-c) = abs(pcv)
2611
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2612
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2613
pand mm0, mm4 // Only pav bytes < 0 in mm0
2614
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2620
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2621
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2626
pcmpgtw mm7, mm5 // pa > pb?
2628
// use mm0 mask copy to merge a & b
2630
// use mm7 mask to merge pa & pb
2636
// test ((pa <= pb)? pa:pb) <= pc
2637
pcmpgtw mm7, mm6 // pab > pc?
2643
// Step ebx to next set of 8 bytes and repeat loop til done
2645
pand mm1, ActiveMaskEnd // keep only bytes 6-7 (3rd group)
2646
// The -8 below presumably compensates for ebx having already been stepped
// (the add itself is not visible in this excerpt).
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2649
pxor mm0, mm0 // pxor does not affect flags
2650
movq [edi + ebx - 8], mm1 // write back updated value
2651
// mm1 will be used as Raw(x-bpp) next loop
2652
// mm3 ready to be used as Prior(x-bpp) next loop
2662
// MMX Paeth decode with the shift counts derived from bpp at run time; each
// 8-byte quadword is handled as two sets of 4 bytes.
// NOTE(review): the case label(s) selecting this section are not visible in
// this excerpt -- confirm which bpp values reach it.
// a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp); edi addresses the current
// row and esi the prior row.
ActiveMask.use = 0x00000000ffffffff; // selects bytes 0-3 of a quadword
2663
ActiveMask2.use = 0xffffffff00000000; // selects bytes 4-7 of a quadword
2664
ShiftBpp.use = bpp << 3; // == bpp * 8
2665
ShiftRem.use = 64 - ShiftBpp.use;
2671
// PRIME the pump (load the first Raw(x-bpp) data set)
2672
movq mm1, [edi+ebx-8]
2675
// Must shift to position Raw(x-bpp) data
2677
// Do first set of 4 bytes
2678
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2679
punpcklbw mm1, mm0 // Unpack Low bytes of a
2680
movq mm2, [esi + ebx] // load b=Prior(x)
2681
punpcklbw mm2, mm0 // Unpack Low bytes of b
2682
// Must shift to position Prior(x-bpp) data
2684
// pav = p - a = (a + b - c) - a = b - c
2686
punpcklbw mm3, mm0 // Unpack Low bytes of c
2687
// pbv = p - b = (a + b - c) - b = a - c
2691
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2694
// pa = abs(p-a) = abs(pav)
2695
// pb = abs(p-b) = abs(pbv)
2696
// pc = abs(p-c) = abs(pcv)
// (pcmpgtw is a signed 16-bit compare: pav/pbv/pcv are held as words)
2697
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2699
pand mm0, mm4 // Only pav bytes < 0 in mm0
2700
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2702
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2706
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2707
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2713
pcmpgtw mm7, mm5 // pa > pb?
2715
// use mm7 mask to merge pa & pb
2717
// use mm0 mask copy to merge a & b
2723
// test ((pa <= pb)? pa:pb) <= pc
2724
pcmpgtw mm7, mm6 // pab > pc?
2731
movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2732
pand mm7, ActiveMask // keep only bytes 0-3 (1st set)
2734
movq mm2, [esi + ebx] // load b=Prior(x) step 1
2735
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2737
movq [edi + ebx], mm7 // write back updated value
2738
movq mm1, [edi+ebx-8] // reload the previous 8 row bytes as source for a
2744
punpckhbw mm3, mm0 // Unpack High bytes of c
2746
// Do second set of 4 bytes
2747
punpckhbw mm2, mm0 // Unpack High bytes of b
2748
punpckhbw mm1, mm0 // Unpack High bytes of a
2749
// pav = p - a = (a + b - c) - a = b - c
2751
// pbv = p - b = (a + b - c) - b = a - c
2755
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2758
// pa = abs(p-a) = abs(pav)
2759
// pb = abs(p-b) = abs(pbv)
2760
// pc = abs(p-c) = abs(pcv)
2761
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2763
pand mm0, mm4 // Only pav bytes < 0 in mm0
2764
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2766
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2770
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2771
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2777
pcmpgtw mm7, mm5 // pa > pb?
2779
// use mm7 mask to merge pa & pb
2781
// use mm0 mask copy to merge a & b
2787
// test ((pa <= pb)? pa:pb) <= pc
2788
pcmpgtw mm7, mm6 // pab > pc?
2795
// Step ebx to next set of 8 bytes and repeat loop til done
2798
// The -8 below presumably compensates for ebx having already been stepped
// (the add itself is not visible in this excerpt).
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2800
movq [edi + ebx - 8], mm1 // write back updated value
2801
// mm1 will be used as Raw(x-bpp) next loop
2809
// MMX Paeth decode, two sets of 4 bytes per 8-byte quadword.
// NOTE(review): the case label is not visible in this excerpt. The data flow
// (a and c for the first set come from the HIGH half of the previous
// quadword; for the second set they come from the LOW half of the current
// one) suggests bpp == 4 -- confirm.
// a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp); edi addresses the current
// row and esi the prior row.
ActiveMask.use = 0x00000000ffffffff; // selects bytes 0-3 of a quadword
2815
// PRIME the pump (load the first Raw(x-bpp) data set)
2816
movq mm1, [edi+ebx-8] // Only time should need to read
2817
// a=Raw(x-bpp) bytes
2819
// Do first set of 4 bytes
2820
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2821
punpckhbw mm1, mm0 // Unpack High bytes of a
2822
movq mm2, [esi + ebx] // load b=Prior(x)
2823
punpcklbw mm2, mm0 // Unpack Low bytes of b
2824
// pav = p - a = (a + b - c) - a = b - c
2826
punpckhbw mm3, mm0 // Unpack High bytes of c
2827
// pbv = p - b = (a + b - c) - b = a - c
2831
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2834
// pa = abs(p-a) = abs(pav)
2835
// pb = abs(p-b) = abs(pbv)
2836
// pc = abs(p-c) = abs(pcv)
// (pcmpgtw is a signed 16-bit compare: pav/pbv/pcv are held as words)
2837
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2839
pand mm0, mm4 // Only pav bytes < 0 in mm0
2840
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2842
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2846
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2847
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2853
pcmpgtw mm7, mm5 // pa > pb?
2855
// use mm7 mask to merge pa & pb
2857
// use mm0 mask copy to merge a & b
2863
// test ((pa <= pb)? pa:pb) <= pc
2864
pcmpgtw mm7, mm6 // pab > pc?
2871
movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2872
pand mm7, ActiveMask // keep only bytes 0-3 (1st set)
2873
movq mm2, mm3 // load b=Prior(x) step 1
2874
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2875
punpcklbw mm3, mm0 // Unpack Low bytes of c
2876
movq [edi + ebx], mm7 // write back updated value
2877
movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2878
// Do second set of 4 bytes
2879
punpckhbw mm2, mm0 // Unpack High bytes of b
2880
punpcklbw mm1, mm0 // Unpack Low bytes of a
2881
// pav = p - a = (a + b - c) - a = b - c
2883
// pbv = p - b = (a + b - c) - b = a - c
2887
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2890
// pa = abs(p-a) = abs(pav)
2891
// pb = abs(p-b) = abs(pbv)
2892
// pc = abs(p-c) = abs(pcv)
2893
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2895
pand mm0, mm4 // Only pav bytes < 0 in mm0
2896
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2898
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2902
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2903
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2909
pcmpgtw mm7, mm5 // pa > pb?
2911
// use mm7 mask to merge pa & pb
2913
// use mm0 mask copy to merge a & b
2919
// test ((pa <= pb)? pa:pb) <= pc
2920
pcmpgtw mm7, mm6 // pab > pc?
2927
// Step ebx to next set of 8 bytes and repeat loop til done
2930
// The -8 below presumably compensates for ebx having already been stepped
// (the add itself is not visible in this excerpt).
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2932
movq [edi + ebx - 8], mm1 // write back updated value
2933
// mm1 will be used as Raw(x-bpp) next loop
2940
// MMX Paeth decode, two sets of 4 bytes per 8-byte quadword.
// NOTE(review): the case label is not visible in this excerpt. Both a and c
// are read from the previous quadword ([..+ebx-8]) for the low AND the high
// set, which suggests bpp == 8 -- confirm.
// a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp); edi addresses the current
// row and esi the prior row.
ActiveMask.use = 0x00000000ffffffff; // selects bytes 0-3 of a quadword
2946
// PRIME the pump (load the first Raw(x-bpp) data set)
2947
movq mm1, [edi+ebx-8] // Only time should need to read
2948
// a=Raw(x-bpp) bytes
2950
// Do first set of 4 bytes
2951
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2952
punpcklbw mm1, mm0 // Unpack Low bytes of a
2953
movq mm2, [esi + ebx] // load b=Prior(x)
2954
punpcklbw mm2, mm0 // Unpack Low bytes of b
2955
// pav = p - a = (a + b - c) - a = b - c
2957
punpcklbw mm3, mm0 // Unpack Low bytes of c
2958
// pbv = p - b = (a + b - c) - b = a - c
2962
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2965
// pa = abs(p-a) = abs(pav)
2966
// pb = abs(p-b) = abs(pbv)
2967
// pc = abs(p-c) = abs(pcv)
// (pcmpgtw is a signed 16-bit compare: pav/pbv/pcv are held as words)
2968
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2970
pand mm0, mm4 // Only pav bytes < 0 in mm0
2971
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2973
pand mm7, mm5 // Only pbv bytes < 0 in mm7
2977
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2978
pand mm0, mm6 // Only pcv bytes < 0 in mm0
2984
pcmpgtw mm7, mm5 // pa > pb?
2986
// use mm7 mask to merge pa & pb
2988
// use mm0 mask copy to merge a & b
2994
// test ((pa <= pb)? pa:pb) <= pc
2995
pcmpgtw mm7, mm6 // pab > pc?
3002
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
3003
pand mm7, ActiveMask // keep only bytes 0-3 (1st set)
3004
movq mm2, [esi + ebx] // load b=Prior(x)
3005
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
3006
punpckhbw mm3, mm0 // Unpack High bytes of c
3007
movq [edi + ebx], mm7 // write back updated value
3008
movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3010
// Do second set of 4 bytes
3011
punpckhbw mm2, mm0 // Unpack High bytes of b
3012
punpckhbw mm1, mm0 // Unpack High bytes of a
3013
// pav = p - a = (a + b - c) - a = b - c
3015
// pbv = p - b = (a + b - c) - b = a - c
3019
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3022
// pa = abs(p-a) = abs(pav)
3023
// pb = abs(p-b) = abs(pbv)
3024
// pc = abs(p-c) = abs(pcv)
3025
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3027
pand mm0, mm4 // Only pav bytes < 0 in mm0
3028
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3030
pand mm7, mm5 // Only pbv bytes < 0 in mm7
3034
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3035
pand mm0, mm6 // Only pcv bytes < 0 in mm0
3041
pcmpgtw mm7, mm5 // pa > pb?
3043
// use mm7 mask to merge pa & pb
3045
// use mm0 mask copy to merge a & b
3051
// test ((pa <= pb)? pa:pb) <= pc
3052
pcmpgtw mm7, mm6 // pab > pc?
3059
// Step ebx to next set of 8 bytes and repeat loop til done
3062
// The -8 below presumably compensates for ebx having already been stepped
// (the add itself is not visible in this excerpt).
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3064
movq [edi + ebx - 8], mm1 // write back updated value
3065
// mm1 will be used as Raw(x-bpp) next loop
3081
// Do Paeth decode for remaining bytes
// Byte-at-a-time path that finishes the row without MMX and then returns
// straight from the switch.
// NOTE(review): the case label(s) selecting this path are not visible in
// this excerpt -- confirm which bpp values reach it.
// Operands, as addressed below: a = Raw(x-bpp) at [edi + edx],
// b = Prior(x) at [esi + ebx], c = Prior(x-bpp) at [esi + edx].
3083
xor ecx, ecx // zero ecx before using cl & cx in loop below
3084
sub edx, bpp // Set edx = ebx - bpp
3087
// pav = p - a = (a + b - c) - a = b - c
3088
mov al, [esi + ebx] // load Prior(x) into al
3089
mov cl, [esi + edx] // load Prior(x-bpp) into cl
3090
sub eax, ecx // subtract Prior(x-bpp)
3091
mov patemp, eax // Save pav for later use
3093
// pbv = p - b = (a + b - c) - b = a - c
3094
mov al, [edi + edx] // load Raw(x-bpp) into al
3095
sub eax, ecx // subtract Prior(x-bpp)
3097
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3098
add eax, patemp // pcv = pav + pbv
3100
// pc = abs(pcv): test the sign bit and negate when it is set
test eax, 0x80000000
3102
neg eax // reverse sign of neg values
3104
mov pctemp, eax // save pc for later use
3106
// pb = abs(pbv), same sign-bit test
// NOTE(review): the instruction that puts pbv into ecx is not visible here.
test ecx, 0x80000000
3108
neg ecx // reverse sign of neg values
3110
mov pbtemp, ecx // save pb for later use
3113
// pa = abs(pav), same sign-bit test
// NOTE(review): the instruction that reloads pav into eax is not visible here.
test eax, 0x80000000
3115
neg eax // reverse sign of neg values
3117
mov patemp, eax // save pa for later use
3121
// Predictor selection: the smallest of pa, pb, pc picks a, b or c
// respectively (ties resolved in that order); cl receives the chosen byte.
// pa > pb; now test if pb <= pc
3124
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3125
mov cl, [esi + edx] // load Prior(x-bpp) into cl
3128
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3129
mov cl, [esi + ebx] // load Prior(x) into cl
3132
// pa <= pb; now test if pa <= pc
3135
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3136
mov cl, [esi + edx] // load Prior(x-bpp) into cl
3139
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3140
mov cl, [edi + edx] // load Raw(x-bpp) into cl
3144
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
// (byte-sized add wraps mod 256; the -1 presumably compensates for ebx
// having already been incremented -- the inc is not visible here)
3145
add [edi + ebx - 1], cl
3151
return; // No need to go further with this one
3152
} // end switch ( bpp )
3155
// MMX acceleration complete now do clean-up
3156
// Check if any remaining bytes left to decode
// (presumably the bytes between MMXLength and FullLength; the loads and the
// compare/branch that bound this loop are not visible in this excerpt)
3162
// Do Paeth decode for remaining bytes
// Operands, as addressed below: a = Raw(x-bpp) at [edi + edx],
// b = Prior(x) at [esi + ebx], c = Prior(x-bpp) at [esi + edx].
3164
xor ecx, ecx // zero ecx before using cl & cx in loop below
3165
sub edx, bpp // Set edx = ebx - bpp
3168
// pav = p - a = (a + b - c) - a = b - c
3169
mov al, [esi + ebx] // load Prior(x) into al
3170
mov cl, [esi + edx] // load Prior(x-bpp) into cl
3171
sub eax, ecx // subtract Prior(x-bpp)
3172
mov patemp, eax // Save pav for later use
3174
// pbv = p - b = (a + b - c) - b = a - c
3175
mov al, [edi + edx] // load Raw(x-bpp) into al
3176
sub eax, ecx // subtract Prior(x-bpp)
3178
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3179
add eax, patemp // pcv = pav + pbv
3181
// pc = abs(pcv): test the sign bit and negate when it is set
test eax, 0x80000000
3183
neg eax // reverse sign of neg values
3185
mov pctemp, eax // save pc for later use
3187
// pb = abs(pbv), same sign-bit test
// NOTE(review): the instruction that puts pbv into ecx is not visible here.
test ecx, 0x80000000
3189
neg ecx // reverse sign of neg values
3191
mov pbtemp, ecx // save pb for later use
3194
// pa = abs(pav), same sign-bit test
// NOTE(review): the instruction that reloads pav into eax is not visible here.
test eax, 0x80000000
3196
neg eax // reverse sign of neg values
3198
mov patemp, eax // save pa for later use
3202
// Predictor selection: the smallest of pa, pb, pc picks a, b or c
// respectively (ties resolved in that order); cl receives the chosen byte.
// pa > pb; now test if pb <= pc
3205
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3206
mov cl, [esi + edx] // load Prior(x-bpp) into cl
3209
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3210
mov cl, [esi + ebx] // load Prior(x) into cl
3213
// pa <= pb; now test if pa <= pc
3216
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3217
mov cl, [esi + edx] // load Prior(x-bpp) into cl
3220
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3221
mov cl, [edi + edx] // load Raw(x-bpp) into cl
3225
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
// (byte-sized add wraps mod 256; the -1 presumably compensates for ebx
// having already been incremented -- the inc is not visible here)
3226
add [edi + ebx - 1], cl
3230
emms // End MMX instructions; prep for possible FP instrs.
3234
// Optimized code for PNG Sub filter decoder
3236
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3240
png_uint_32 FullLength;
3241
png_uint_32 MMXLength;
3244
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3245
FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3248
mov esi, edi // lp = row
3249
add edi, bpp // rp = row + bpp
3251
// get # of bytes to alignment
3252
mov diff, edi // take start of row
3253
add diff, 0xf // add 7 + 8 to incr past
3254
// alignment boundary
3256
and diff, 0xfffffff8 // mask to alignment boundary
3257
sub diff, edi // subtract from start ==> value
3270
sub edx, ebx // subtract alignment fix
3271
and edx, 0x00000007 // calc bytes over mult of 8
3272
sub ecx, edx // drop over bytes from length
3276
// Now do the math for the rest of the row
3281
ActiveMask.use = 0x0000ffffff000000;
3282
ShiftBpp.use = 24; // == 3 * 8
3283
ShiftRem.use = 40; // == 64 - 24
3286
movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3287
mov esi, edi // lp = row
3288
add edi, bpp // rp = row + bpp
3291
psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3293
// PRIME the pump (load the first Raw(x-bpp) data set
3294
movq mm1, [edi+ebx-8]
3296
psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3297
// no need for mask; shift clears inactive bytes
3298
// Add 1st active group
3301
// Add 2nd active group
3302
movq mm1, mm0 // mov updated Raws to mm1
3303
psllq mm1, ShiftBpp // shift data to position correctly
3304
pand mm1, mm7 // mask to use only 2nd active group
3306
// Add 3rd active group
3307
movq mm1, mm0 // mov updated Raws to mm1
3308
psllq mm1, ShiftBpp // shift data to position correctly
3309
pand mm1, mm6 // mask to use only 3rd active group
3313
movq [edi+ebx-8], mm0 // Write updated Raws back to array
3314
// Prep for doing 1st add at top of loop
3323
// Placed here just in case this is a duplicate of the
3324
// non-MMX code for the SUB filter in png_read_filter_row above
3329
// bpp = (row_info->pixel_depth + 7) >> 3;
3330
// for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3331
// i < row_info->rowbytes; i++, rp++, lp++)
3333
// *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3340
mov esi, edi // lp = row
3342
add edi, bpp // rp = row + bpp
3359
ShiftBpp.use = bpp << 3;
3360
ShiftRem.use = 64 - ShiftBpp.use;
3364
mov esi, edi // lp = row
3365
add edi, bpp // rp = row + bpp
3366
// PRIME the pump (load the first Raw(x-bpp) data set
3367
movq mm1, [edi+ebx-8]
3369
psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3370
// no need for mask; shift clears inactive bytes
3373
// Add 2nd active group
3374
movq mm1, mm0 // mov updated Raws to mm1
3375
psllq mm1, ShiftBpp // shift data to position correctly
3376
// there is no need for any mask
3377
// since shift clears inactive bits/bytes
3381
movq [edi+ebx-8], mm0
3382
movq mm1, mm0 // Prep for doing 1st add at top of loop
3390
ActiveMask.use = 0x00000000ffff0000;
3391
ShiftBpp.use = 16; // == 2 * 8
3392
ShiftRem.use = 48; // == 64 - 16
3394
movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3398
psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3400
mov esi, edi // lp = row
3402
add edi, bpp // rp = row + bpp
3403
psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3405
// PRIME the pump (load the first Raw(x-bpp) data set
3406
movq mm1, [edi+ebx-8]
3408
// Add 1st active group
3409
psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3410
// no need for mask; shift clears inactive
3414
// Add 2nd active group
3415
movq mm1, mm0 // mov updated Raws to mm1
3416
psllq mm1, ShiftBpp // shift data to position correctly
3417
pand mm1, mm7 // mask to use only 2nd active group
3419
// Add 3rd active group
3420
movq mm1, mm0 // mov updated Raws to mm1
3421
psllq mm1, ShiftBpp // shift data to position correctly
3422
pand mm1, mm6 // mask to use only 3rd active group
3424
// Add 4th active group
3425
movq mm1, mm0 // mov updated Raws to mm1
3426
psllq mm1, ShiftBpp // shift data to position correctly
3427
pand mm1, mm5 // mask to use only 4th active group
3431
movq [edi+ebx-8], mm0 // Write updated Raws back to array
3432
movq mm1, mm0 // Prep for doing 1st add at top of loop
3442
mov esi, edi // lp = row
3443
add edi, bpp // rp = row + bpp
3445
movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3446
// Raw(x-bpp) data set
3447
and ecx, 0x0000003f // calc bytes over mult of 64
3449
movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3451
movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3452
movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3453
// Now mm0 will be used as Raw(x-bpp) for
3454
// the 2nd group of 8 bytes. This will be
3455
// repeated for each group of 8 bytes with
3456
// the 8th group being used as the Raw(x-bpp)
3457
// for the 1st group of the next loop.
3459
movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3460
movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3462
movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3463
movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3465
movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3466
movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3468
movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3469
movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3471
movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3472
movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3474
movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3475
movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3479
movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3488
movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3489
movq mm7, mm0 // Move calculated Raw(x) data to mm7 to
3490
// be the new Raw(x-bpp) for the next loop
3497
default: // bpp greater than 8 bytes
3502
mov esi, edi // lp = row
3503
add edi, bpp // rp = row + bpp
3510
movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3517
} // end switch ( bpp )
3524
mov esi, edi // lp = row
3526
add edi, bpp // rp = row + bpp
3534
emms // End MMX instructions; prep for possible FP instrs.
3538
// Optimized code for PNG Up filter decoder
//
// png_read_filter_row_mmx_up: MMX version of the Up-filter decode step.
// png_read_filter_row calls it as (row_info, row, prev_row) when
// pixel_depth > 8 and rowbytes >= 128; otherwise the plain C loop
// "*rp = (*rp + *pp) & 0xff" is used instead.
//
// NOTE(review): only part of this routine is visible here (register
// setup, labels, jumps and the arithmetic instructions are not shown),
// so the notes below describe the visible lines only.  Every visible
// store goes to an edi-based address and the second operand is loaded
// from an esi-based address; presumably edi -> row and esi -> prev_row
// -- TODO confirm against the register setup.
//
// Going by the visible comments the work is done in four stages:
//   1. a byte-at-a-time loop up to an alignment boundary,
//   2. an unrolled loop using mm0-mm7 on 8-byte groups at offsets
//      +8 .. +56 (the first group's loads are not visible),
//   3. a loop that handles 8 bytes per pass with mm0 & mm1,
//   4. a byte-at-a-time loop for whatever is left,
// followed by emms.
3540
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3544
len = row_info->rowbytes; // # of bytes to filter
3547
// get # of bytes to alignment
3562
mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3567
sub edx, ebx // subtract alignment fix
3568
and edx, 0x0000003f // calc bytes over mult of 64
3569
sub ecx, edx // drop over bytes from length
3570
// Unrolled loop - use all MMX registers and interleave to reduce
3571
// number of branch instructions (loops) and reduce partial stalls
3575
movq mm3, [esi+ebx+8] // offset +8: load 8 bytes via esi
3577
movq mm2, [edi+ebx+8] // offset +8: load 8 bytes via edi
3580
movq mm5, [esi+ebx+16] // offset +16: load 8 bytes via esi
3581
movq [edi+ebx+8], mm2 // offset +8: write mm2 back via edi
3582
movq mm4, [edi+ebx+16] // offset +16: load 8 bytes via edi
3583
movq mm7, [esi+ebx+24] // offset +24: load 8 bytes via esi
3585
movq mm6, [edi+ebx+24] // offset +24: load 8 bytes via edi
3586
movq [edi+ebx+16], mm4 // offset +16: write mm4 back via edi
3588
movq mm1, [esi+ebx+32] // offset +32: load 8 bytes via esi
3589
movq [edi+ebx+24], mm6 // offset +24: write mm6 back via edi
3590
movq mm0, [edi+ebx+32] // offset +32: load 8 bytes via edi
3591
movq mm3, [esi+ebx+40] // offset +40: load 8 bytes via esi (mm3 reused)
3593
movq mm2, [edi+ebx+40] // offset +40: load 8 bytes via edi (mm2 reused)
3594
movq [edi+ebx+32], mm0 // offset +32: write mm0 back via edi
3596
movq mm5, [esi+ebx+48] // offset +48: load 8 bytes via esi (mm5 reused)
3597
movq [edi+ebx+40], mm2 // offset +40: write mm2 back via edi
3598
movq mm4, [edi+ebx+48] // offset +48: load 8 bytes via edi (mm4 reused)
3599
movq mm7, [esi+ebx+56] // offset +56: load 8 bytes via esi (mm7 reused)
3601
movq mm6, [edi+ebx+56] // offset +56: load 8 bytes via edi (mm6 reused)
3602
movq [edi+ebx+48], mm4 // offset +48: write mm4 back via edi
3606
movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3607
// -8 to offset add ebx
3610
cmp edx, 0 // Test for bytes over mult of 64
3614
// 2 lines added by lcreeve@netins.net
3615
// (mail 11 Jul 98 in png-implement list)
3616
cmp edx, 8 //test for less than 8 bytes
3621
and edx, 0x00000007 // calc bytes over mult of 8
3622
sub ecx, edx // drop over bytes from length
3624
// Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3631
movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3633
cmp edx, 0 // Test for bytes over mult of 8
3637
add ecx, edx // move over byte count into counter
3638
// Loop using x86 registers to update remaining bytes
3644
mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3647
// Conversion of filtered row completed
3648
emms // End MMX instructions; prep for possible FP instrs.
3653
// Optimized png_read_filter_row routines
3655
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3656
row, png_bytep prev_row, int filter)
3663
if (mmx_supported == 2)
3664
mmx_supported = mmxsupport();
3668
png_read_filter_row_c(png_ptr, row_info, row, prev_row, filter);
3673
png_debug(1, "in png_read_filter_row\n");
3675
png_debug1(0,"%s, ", "MMX");
3677
png_debug1(0,"%s, ", "x86");
3681
case 0: sprintf(filnm, "None ");
3683
case 1: sprintf(filnm, "Sub ");
3685
case 2: sprintf(filnm, "Up ");
3687
case 3: sprintf(filnm, "Avg ");
3689
case 4: sprintf(filnm, "Paeth");
3691
default: sprintf(filnm, "Unknw");
3694
png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3695
png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3696
(int)((row_info->pixel_depth + 7) >> 3));
3697
png_debug1(0,"len=%8d, ", row_info->rowbytes);
3702
case PNG_FILTER_VALUE_NONE:
3704
case PNG_FILTER_VALUE_SUB:
3707
if ((row_info->pixel_depth > 8) &&
3708
(row_info->rowbytes >= 128) )
3710
png_read_filter_row_mmx_sub(row_info, row);
3716
png_uint_32 istop = row_info->rowbytes;
3717
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3718
png_bytep rp = row + bpp;
3721
for (i = bpp; i < istop; i++)
3723
*rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3729
case PNG_FILTER_VALUE_UP:
3732
if ((row_info->pixel_depth > 8) &&
3733
(row_info->rowbytes >= 128) )
3735
png_read_filter_row_mmx_up(row_info, row, prev_row);
3743
for (i = 0, rp = row, pp = prev_row;
3744
i < row_info->rowbytes; i++, rp++, pp++)
3746
*rp = (png_byte)(((int)(*rp) + (int)(*pp)) & 0xff);
3751
case PNG_FILTER_VALUE_AVG:
3754
if ((row_info->pixel_depth > 8) &&
3755
(row_info->rowbytes >= 128) )
3757
png_read_filter_row_mmx_avg(row_info, row, prev_row);
3764
png_bytep pp = prev_row;
3766
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3767
png_uint_32 istop = row_info->rowbytes - bpp;
3769
for (i = 0; i < bpp; i++)
3771
*rp = (png_byte)(((int)(*rp) +
3772
((int)(*pp++) >> 1)) & 0xff);
3776
for (i = 0; i < istop; i++)
3778
*rp = (png_byte)(((int)(*rp) +
3779
((int)(*pp++ + *lp++) >> 1)) & 0xff);
3785
case PNG_FILTER_VALUE_PAETH:
3788
if ((row_info->pixel_depth > 8) &&
3789
(row_info->rowbytes >= 128) )
3791
png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3798
png_bytep pp = prev_row;
3800
png_bytep cp = prev_row;
3801
png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3802
png_uint_32 istop=row_info->rowbytes - bpp;
3804
for (i = 0; i < bpp; i++)
3806
*rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3810
for (i = 0; i < istop; i++) // use leftover rp,pp
3812
int a, b, c, pa, pb, pc, p;
3826
pa = p < 0 ? -p : p;
3827
pb = pc < 0 ? -pc : pc;
3828
pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3832
if (pa <= pb && pa <= pc)
3840
p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3842
*rp = (png_byte)(((int)(*rp) + p) & 0xff);
3849
png_warning(png_ptr, "Ignoring bad adaptive filter type");