2
* inffast.S is a hand tuned assembler version of:
4
* inffast.c -- fast decoding
5
* Copyright (C) 1995-2003 Mark Adler
6
* For conditions of distribution and use, see copyright notice in zlib.h
8
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
9
* Please use the copyright conditions above.
11
* This version (Jan-23-2003) of inflate_fast was coded and tested under
12
* GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that
13
* machine, I found that gzip style archives decompressed about 20% faster than
14
* the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results will
15
* depend on how large of a buffer is used for z_stream.next_in & next_out
16
* (8K-32K worked best for my 256K cpu cache) and how much overhead there is in
17
* stream processing I/O and crc32/adler32. In my case, this routine used
18
* 70% of the cpu time and crc32 used 20%.
20
* I am confident that this version will work in the general case, but I have
21
* not tested a wide variety of datasets or a wide variety of platforms.
23
* Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
24
* It should be a runtime flag instead of compile time flag...
26
* Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
27
* With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX code
28
* is compiled. Without either option, runtime detection is enabled. Runtime
29
* detection should work on all modern cpus and the recommended algorithm (flip
30
* ID bit on eflags and then use the cpuid instruction) is used in many
31
* multimedia applications. Tested under win2k with gcc-2.95 and gas-2.12
32
* distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o
33
* inffast.obj generates a COFF object which can then be linked with MSVC++
34
* compiled code. Tested under FreeBSD 4.7 with gcc-2.95.
36
* Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
37
* slower than compiler generated code). Adjusted cpuid check to use the MMX
38
* code only for Pentiums < P4 until I have more data on the P4. Speed
39
* improvement is only about 15% on the Athlon when compared with code generated
40
* with MSVC++. Not sure yet, but I think the P4 will also be slower using the
41
* MMX mode because many of its x86 ALU instructions execute in .5 cycles and
42
* have less latency than MMX ops. Added code to buffer the last 11 bytes of
43
* the input stream since the MMX code grabs bits in chunks of 32, which
44
* differs from the inffast.c algorithm. I don't think there would have been
45
* read overruns where a page boundary was crossed (a segfault), but there
46
* could have been overruns when next_in ends on unaligned memory (uninitialized
49
* Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C
50
* version of the non-MMX code so that it doesn't depend on zstrm and zstate
51
* structure offsets which are hard coded in this file. This was last tested
52
* with zlib-1.2.0 which is currently in beta testing, newer versions of this
53
* and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
54
* http://www.charm.net/~christop/zlib/
59
* if you have underscore linking problems (_inflate_fast undefined), try
62
#if ! defined( GAS_COFF ) && ! defined( GAS_ELF )
64
#if defined( WIN32 ) || defined( __CYGWIN__ )
65
#define GAS_COFF /* windows object format */
70
#endif /* ! GAS_COFF && ! GAS_ELF */
73
#if defined( GAS_COFF )
75
/* coff externals have underscores */
76
#define inflate_fast _inflate_fast
77
#define inflate_fast_use_mmx _inflate_fast_use_mmx
88
.L_invalid_literal_length_code_msg:
89
.string "invalid literal/length code"
92
.L_invalid_distance_code_msg:
93
.string "invalid distance code"
96
.L_invalid_distance_too_far_msg:
97
.string "invalid distance too far back"
99
#if ! defined( NO_MMX )
101
.L_mask: /* mask[N] = ( 1 << N ) - 1 */
140
* struct z_stream offsets, in zlib.h
142
#define next_in_strm 0 /* strm->next_in */
143
#define avail_in_strm 4 /* strm->avail_in */
144
#define next_out_strm 12 /* strm->next_out */
145
#define avail_out_strm 16 /* strm->avail_out */
146
#define msg_strm 24 /* strm->msg */
147
#define state_strm 28 /* strm->state */
150
* struct inflate_state offsets, in inflate.h
152
#define mode_state 0 /* state->mode */
153
#define wsize_state 32 /* state->wsize */
154
#define write_state 40 /* state->write */
155
#define window_state 44 /* state->window */
156
#define hold_state 48 /* state->hold */
157
#define bits_state 52 /* state->bits */
158
#define lencode_state 68 /* state->lencode */
159
#define distcode_state 72 /* state->distcode */
160
#define lenbits_state 76 /* state->lenbits */
161
#define distbits_state 80 /* state->distbits */
164
* inflate_fast's activation record
166
#define local_var_size 64 /* how much local space for vars */
167
#define strm_sp 88 /* first arg: z_stream * (local_var_size + 24) */
168
#define start_sp 92 /* second arg: unsigned int (local_var_size + 28) */
171
* offsets for local vars on stack
173
#define out 60 /* unsigned char* */
174
#define window 56 /* unsigned char* */
175
#define wsize 52 /* unsigned int */
176
#define write 48 /* unsigned int */
177
#define in 44 /* unsigned char* */
178
#define beg 40 /* unsigned char* */
179
#define buf 28 /* char[ 12 ] */
180
#define len 24 /* unsigned int */
181
#define last 20 /* unsigned char* */
182
#define end 16 /* unsigned char* */
183
#define dcode 12 /* code* */
184
#define lcode 8 /* code* */
185
#define dmask 4 /* unsigned int */
186
#define lmask 0 /* unsigned int */
189
* typedef enum inflate_mode consts, in inflate.h
191
#define INFLATE_MODE_TYPE 11 /* state->mode flags enum-ed in inflate.h */
192
#define INFLATE_MODE_BAD 26
195
#if ! defined( USE_MMX ) && ! defined( NO_MMX )
201
#define DONT_USE_MMX 3
203
.globl inflate_fast_use_mmx
208
inflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */
211
#if defined( GAS_ELF )
213
.type inflate_fast_use_mmx,@object
214
.size inflate_fast_use_mmx,4
217
#endif /* RUN_TIME_MMX */
219
#if defined( GAS_COFF )
220
/* coff info: scl 2 = extern, type 32 = function */
221
.def inflate_fast; .scl 2; .type 32; .endef
232
pushf /* save eflags (the strm_sp and start_sp argument offsets assume this pushes 32 bits) */
233
subl $local_var_size, %esp /* reserve local_var_size bytes of stack for the local vars */
239
movl strm_sp(%esp), strm_r
240
movl state_strm(strm_r), state_r
242
/* in = strm->next_in;
243
* out = strm->next_out;
244
* last = in + strm->avail_in - 11;
245
* beg = out - (start - strm->avail_out);
246
* end = out + (strm->avail_out - 257);
248
movl avail_in_strm(strm_r), %edx
249
movl next_in_strm(strm_r), %eax
251
addl %eax, %edx /* avail_in += next_in */
252
subl $11, %edx /* avail_in -= 11 */
255
movl %edx, last(%esp)
257
movl start_sp(%esp), %ebp
258
movl avail_out_strm(strm_r), %ecx
259
movl next_out_strm(strm_r), %ebx
261
subl %ecx, %ebp /* start -= avail_out */
262
negl %ebp /* start = -start */
263
addl %ebx, %ebp /* start += next_out */
265
subl $257, %ecx /* avail_out -= 257 */
266
addl %ebx, %ecx /* avail_out += out */
272
/* wsize = state->wsize;
273
* write = state->write;
274
* window = state->window;
275
* hold = state->hold;
276
* bits = state->bits;
277
* lcode = state->lencode;
278
* dcode = state->distcode;
279
* lmask = ( 1 << state->lenbits ) - 1;
280
* dmask = ( 1 << state->distbits ) - 1;
283
movl lencode_state(state_r), %eax
284
movl distcode_state(state_r), %ecx
286
movl %eax, lcode(%esp)
287
movl %ecx, dcode(%esp)
290
movl lenbits_state(state_r), %ecx
293
movl %eax, lmask(%esp)
296
movl distbits_state(state_r), %ecx
299
movl %eax, dmask(%esp)
301
movl wsize_state(state_r), %eax
302
movl write_state(state_r), %ecx
303
movl window_state(state_r), %edx
305
movl %eax, wsize(%esp)
306
movl %ecx, write(%esp)
307
movl %edx, window(%esp)
309
movl hold_state(state_r), %ebp
310
movl bits_state(state_r), %ebx
320
movl last(%esp), %ecx
322
ja .L_align_long /* if in < last */
324
addl $11, %ecx /* ecx = &in[ avail_in ] */
325
subl in_r, %ecx /* ecx = avail_in */
327
subl %ecx, %eax /* eax = 12 - avail_in */
329
rep movsb /* memcpy( buf, in, avail_in ) */
332
rep stosb /* memset( &buf[ avail_in ], 0, 12 - avail_in ) */
333
leal buf(%esp), in_r /* in = buf */
334
movl in_r, last(%esp) /* last = in, do just one iteration */
337
/* align in_r on long boundary */
351
movl out(%esp), out_r
353
#if defined( NO_MMX )
357
#if defined( USE_MMX )
361
/*** Runtime MMX check ***/
363
#if defined( RUN_TIME_MMX )
365
cmpl $DO_USE_MMX, inflate_fast_use_mmx
367
ja .L_do_loop /* > 2 */
374
movl (%esp), %eax /* copy eflags to eax */
375
xorl $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21)
376
* to see if cpu supports cpuid...
377
* ID bit method not supported by NexGen but
378
* bios may load a cpuid instruction and
379
* cpuid may be disabled on Cyrix 5-6x86 */
382
popl %edx /* copy new eflags to edx */
383
xorl %eax, %edx /* test if ID bit is flipped */
384
jz .L_dont_use_mmx /* not flipped if zero */
387
cmpl $0x756e6547, %ebx /* check for GenuineIntel in ebx,ecx,edx */
389
cmpl $0x6c65746e, %ecx
391
cmpl $0x49656e69, %edx
394
cpuid /* get cpu features */
397
cmpl $6, %eax /* check for Pentium family, is 0xf for P4 */
399
testl $0x800000, %edx /* test if MMX feature is set (bit 23) */
403
movl $DO_USE_MMX, inflate_fast_use_mmx
406
movl $DONT_USE_MMX, inflate_fast_use_mmx
416
/*** Non-MMX code ***/
418
#if defined ( NO_MMX ) || defined( RUN_TIME_MMX )
422
#define bitslong_r %ebx
426
/* while (in < last && out < end)
428
cmpl out_r, end(%esp)
429
jbe .L_break_loop /* if (out >= end) */
431
cmpl in_r, last(%esp)
435
/* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
439
* hold |= *((unsigned short *)in)++ << bits;
442
* this = lcode[hold & lmask]
445
ja .L_get_length_code /* if (15 < bits) */
448
lodsw /* al = *(ushort *)in++ */
449
movb bits_r, %cl /* cl = bits, needs it for shifting */
450
addb $16, bits_r /* bits += 16 */
452
orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
455
movl lmask(%esp), %edx /* edx = lmask */
456
movl lcode(%esp), %ecx /* ecx = lcode */
457
andl hold_r, %edx /* edx &= hold */
458
movl (%ecx,%edx,4), %eax /* eax = lcode[hold & lmask] */
461
/* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
467
movb %ah, %cl /* cl = this.bits */
468
subb %ah, bits_r /* bits -= this.bits */
469
shrl %cl, hold_r /* hold >>= this.bits */
471
/* check if op is a literal
473
* PUP(out) = this.val;
477
jnz .L_test_for_length_base /* if (op != 0) 45.7% */
479
shrl $16, %eax /* output this.val char */
483
.L_test_for_length_base:
484
/* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len
486
* else if (op & 16) {
491
* hold |= *((unsigned short *)in)++ << bits;
494
* len += hold & mask[op];
500
movl %eax, len_r /* len = this */
501
shrl $16, len_r /* len = this.val */
505
jz .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */
506
andb $15, %cl /* op &= 15 */
507
jz .L_save_len /* if (!op) */
509
jae .L_add_bits_to_len /* if (op <= bits) */
511
movb %cl, %ch /* stash op in ch, freeing cl */
513
lodsw /* al = *(ushort *)in++ */
514
movb bits_r, %cl /* cl = bits, needs it for shifting */
515
addb $16, bits_r /* bits += 16 */
517
orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
518
movb %ch, %cl /* move op back to ecx */
525
andl hold_r, %eax /* eax &= hold */
527
addl %eax, len_r /* len += hold & mask[op] */
530
movl len_r, len(%esp) /* save len */
534
/* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist
537
* hold |= *((unsigned short *)in)++ << bits;
540
* this = dcode[hold & dmask];
543
* hold >>= this.bits;
548
ja .L_get_distance_code /* if (15 < bits) */
551
lodsw /* al = *(ushort *)in++ */
552
movb bits_r, %cl /* cl = bits, needs it for shifting */
553
addb $16, bits_r /* bits += 16 */
555
orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
557
.L_get_distance_code:
558
movl dmask(%esp), %edx /* edx = dmask */
559
movl dcode(%esp), %ecx /* ecx = dcode */
560
andl hold_r, %edx /* edx &= hold */
561
movl (%ecx,%edx,4), %eax /* eax = dcode[hold & dmask] */
565
movl %eax, dist_r /* dist = this */
566
shrl $16, dist_r /* dist = this.val */
568
subb %ah, bits_r /* bits -= this.bits */
569
shrl %cl, hold_r /* hold >>= this.bits */
575
* hold |= *((unsigned short *)in)++ << bits;
578
* dist += hold & mask[op];
582
movb %al, %cl /* cl = this.op */
584
testb $16, %al /* if ((op & 16) == 0) */
585
jz .L_test_for_second_level_dist
586
andb $15, %cl /* op &= 15 */
589
jae .L_add_bits_to_dist /* if (op <= bits) 97.6% */
591
movb %cl, %ch /* stash op in ch, freeing cl */
593
lodsw /* al = *(ushort *)in++ */
594
movb bits_r, %cl /* cl = bits, needs it for shifting */
595
addb $16, bits_r /* bits += 16 */
597
orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */
598
movb %ch, %cl /* move op back to ecx */
603
decl %eax /* (1 << op) - 1 */
605
andl hold_r, %eax /* eax &= hold */
607
addl %eax, dist_r /* dist += hold & ((1 << op) - 1) */
611
/* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
614
* nbytes = out - beg;
615
* if (dist <= nbytes) {
618
* PUP(out) = PUP(from);
619
* } while (--len > 0) {
623
movl in_r, in(%esp) /* save in so from can use its reg */
625
subl beg(%esp), %eax /* nbytes = out - beg */
628
jb .L_clip_window /* if (dist > nbytes) 4.2% */
632
subl dist_r, from_r /* from = out - dist */
645
movl in(%esp), in_r /* move in back to %esi, toss from */
652
cmpl out_r, beg(%esp)
669
.L_test_for_second_level_length:
670
/* else if ((op & 64) == 0) {
671
* this = lcode[this.val + (hold & mask[op])];
675
jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
680
andl hold_r, %eax /* eax &= hold */
681
addl %edx, %eax /* eax += this.val */
682
movl lcode(%esp), %edx /* edx = lcode */
683
movl (%edx,%eax,4), %eax /* eax = lcode[val + (hold&mask[op])] */
687
.L_test_for_second_level_dist:
688
/* else if ((op & 64) == 0) {
689
* this = dcode[this.val + (hold & mask[op])];
693
jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
698
andl hold_r, %eax /* eax &= hold */
699
addl %edx, %eax /* eax += this.val */
700
movl dcode(%esp), %edx /* edx = dcode */
701
movl (%edx,%eax,4), %eax /* eax = dcode[val + (hold&mask[op])] */
706
/* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
710
* if (dist > wsize) {
714
* nbytes = dist - nbytes;
716
* from += wsize - nbytes;
718
#define nbytes_r %ecx
720
movl wsize(%esp), %eax /* prepare for dist compare */
721
negl nbytes_r /* nbytes = -nbytes */
722
movl window(%esp), from_r /* from = window */
725
jb .L_invalid_distance_too_far /* if (dist > wsize) */
727
addl dist_r, nbytes_r /* nbytes = dist - nbytes */
729
jne .L_wrap_around_window /* if (write != 0) */
732
addl %eax, from_r /* from += wsize - nbytes */
734
/* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
735
* %ecx = nbytes, %eax = len
737
* if (nbytes < len) {
740
* PUP(out) = PUP(from);
741
* } while (--nbytes);
747
movl len(%esp), len_r
749
jbe .L_do_copy1 /* if (nbytes >= len) */
751
subl nbytes_r, len_r /* len -= nbytes */
754
subl dist_r, from_r /* from = out - dist */
758
jbe .L_do_copy1 /* if (nbytes >= len) */
760
subl nbytes_r, len_r /* len -= nbytes */
763
subl dist_r, from_r /* from = out - dist */
766
.L_wrap_around_window:
767
/* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
768
* %ecx = nbytes, %eax = write, %eax = len
770
* else if (write < nbytes) {
771
* from += wsize + write - nbytes;
773
* if (nbytes < len) {
776
* PUP(out) = PUP(from);
777
* } while (--nbytes);
780
* if (nbytes < len) {
783
* PUP(out) = PUP(from);
791
movl write(%esp), write_r
792
cmpl write_r, nbytes_r
793
jbe .L_contiguous_in_window /* if (write >= nbytes) */
795
addl wsize(%esp), from_r
797
subl nbytes_r, from_r /* from += wsize + write - nbytes */
798
subl write_r, nbytes_r /* nbytes -= write */
801
movl len(%esp), len_r
803
jbe .L_do_copy1 /* if (nbytes >= len) */
805
subl nbytes_r, len_r /* len -= nbytes */
807
movl window(%esp), from_r /* from = window */
808
movl write(%esp), nbytes_r /* nbytes = write */
810
jbe .L_do_copy1 /* if (nbytes >= len) */
812
subl nbytes_r, len_r /* len -= nbytes */
815
subl dist_r, from_r /* from = out - dist */
818
.L_contiguous_in_window:
819
/* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
820
* %ecx = nbytes, %eax = write, %eax = len
823
* from += write - nbytes;
824
* if (nbytes < len) {
827
* PUP(out) = PUP(from);
828
* } while (--nbytes);
835
subl nbytes_r, from_r /* from += write - nbytes */
838
movl len(%esp), len_r
840
jbe .L_do_copy1 /* if (nbytes >= len) */
842
subl nbytes_r, len_r /* len -= nbytes */
845
subl dist_r, from_r /* from = out - dist */
848
/* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out
852
* PUP(out) = PUP(from);
856
* } while (in < last && out < end);
863
movl in(%esp), in_r /* move in back to %esi, toss from */
869
#endif /* NO_MMX || RUN_TIME_MMX */
874
#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
882
#define bitslong_r %ebp
885
movl %ebx, bitslong_r
888
#define dmask2_mm %mm2
889
#define lmask2_mm %mm3
890
#define lmask_mm %mm4
891
#define dmask_mm %mm5
894
movd lmask(%esp), lmask_mm
895
movq lmask_mm, lmask2_mm
896
movd dmask(%esp), dmask_mm
897
movq dmask_mm, dmask2_mm
898
pxor used_mm, used_mm
899
movl lcode(%esp), %ebx /* ebx = lcode */
904
/* while (in < last && out < end)
906
cmpl out_r, end(%esp)
907
jbe .L_break_loop /* if (out >= end) */
909
cmpl in_r, last(%esp)
913
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
916
ja .L_get_length_code_mmx /* if (32 < bits) */
918
movd bitslong_r, tmp_mm
923
por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */
925
.L_get_length_code_mmx:
926
pand hold_mm, lmask_mm
928
movq lmask2_mm, lmask_mm
929
movl (%ebx,%eax,4), %eax /* eax = lcode[hold & lmask] */
932
movzbl %ah, %ecx /* ecx = this.bits */
934
subl %ecx, bitslong_r /* bits -= this.bits */
937
jnz .L_test_for_length_base_mmx /* if (op != 0) 45.7% */
939
shrl $16, %eax /* output this.val char */
941
jmp .L_while_test_mmx
943
.L_test_for_length_base_mmx:
945
movl %eax, len_r /* len = this */
946
shrl $16, len_r /* len = this.val */
949
jz .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8% */
950
andl $15, %eax /* op &= 15 */
951
jz .L_decode_distance_mmx /* if (!op) */
953
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
956
subl %eax, bitslong_r
957
andl .L_mask(,%eax,4), %ecx
958
addl %ecx, len_r /* len += hold & mask[op] */
960
.L_decode_distance_mmx:
961
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
964
ja .L_get_dist_code_mmx /* if (32 < bits) */
966
movd bitslong_r, tmp_mm
971
por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */
973
/* Look up the distance code entry for the low bits of hold (MMX path). */
.L_get_dist_code_mmx:
974
movl dcode(%esp), %ebx /* ebx = dcode */
975
pand hold_mm, dmask_mm /* dmask_mm &= hold_mm, i.e. hold & dmask */
977
movq dmask2_mm, dmask_mm /* restore dmask_mm from its copy in dmask2_mm */
978
movl (%ebx,%eax,4), %eax /* eax = dcode[eax]; index is presumably hold & dmask from the pand above */
982
movzbl %ah, %ecx /* ecx = this.bits */
984
shrl $16, dist_r /* dist = this.val */
985
subl %ecx, bitslong_r /* bits -= this.bits */
988
testb $16, %al /* if ((op & 16) == 0) */
989
jz .L_test_for_second_level_dist_mmx
990
andl $15, %eax /* op &= 15 */
991
jz .L_check_dist_one_mmx
993
.L_add_bits_to_dist_mmx:
994
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
995
movd %eax, used_mm /* save bit length of current op */
996
movd hold_mm, %ecx /* get the next bits on input stream */
997
subl %eax, bitslong_r /* bits -= op bits */
998
andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */
999
addl %ecx, dist_r /* dist += hold & mask[op] */
1001
/* Pick the copy source: out - dist, unless dist exceeds the bytes written so far. */
.L_check_window_mmx:
1002
movl in_r, in(%esp) /* save in so from can use its reg */
1004
subl beg(%esp), %eax /* nbytes = out - beg */
1007
jb .L_clip_window_mmx /* if (dist > nbytes) 4.2% */
1011
subl dist_r, from_r /* from = out - dist */
1024
movl in(%esp), in_r /* move in back to %esi, toss from */
1025
movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1026
jmp .L_while_test_mmx
1029
.L_check_dist_one_mmx:
1031
jne .L_check_window_mmx
1032
cmpl out_r, beg(%esp)
1033
je .L_check_window_mmx
1046
movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1047
jmp .L_while_test_mmx
1050
/* Second-level literal/length table lookup (MMX path). */
.L_test_for_second_level_length_mmx:
1052
jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */
1055
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1057
andl .L_mask(,%eax,4), %ecx /* ecx &= mask[eax]; eax is presumably op here */
1059
movl (%ebx,%ecx,4), %eax /* eax = lcode[ecx]; index was masked with mask[], not lmask */
1063
/* Second-level distance table lookup (MMX path). */
.L_test_for_second_level_dist_mmx:
1065
jnz .L_invalid_distance_code /* if ((op & 64) != 0) */
1068
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1070
andl .L_mask(,%eax,4), %ecx /* ecx &= mask[eax]; eax is presumably op here */
1071
movl dcode(%esp), %eax /* eax = dcode */
1073
movl (%eax,%ecx,4), %eax /* eax = dcode[ecx]; index was masked with mask[], not lmask */
1078
#define nbytes_r %ecx
1080
movl wsize(%esp), %eax /* prepare for dist compare */
1081
negl nbytes_r /* nbytes = -nbytes */
1082
movl window(%esp), from_r /* from = window */
1085
jb .L_invalid_distance_too_far /* if (dist > wsize) */
1087
addl dist_r, nbytes_r /* nbytes = dist - nbytes */
1088
cmpl $0, write(%esp)
1089
jne .L_wrap_around_window_mmx /* if (write != 0) */
1092
addl %eax, from_r /* from += wsize - nbytes */
1094
cmpl nbytes_r, len_r
1095
jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1097
subl nbytes_r, len_r /* len -= nbytes */
1100
subl dist_r, from_r /* from = out - dist */
1103
cmpl nbytes_r, len_r
1104
jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1106
subl nbytes_r, len_r /* len -= nbytes */
1109
subl dist_r, from_r /* from = out - dist */
1112
.L_wrap_around_window_mmx:
1113
#define write_r %eax
1114
movl write(%esp), write_r
1115
cmpl write_r, nbytes_r
1116
jbe .L_contiguous_in_window_mmx /* if (write >= nbytes) */
1118
addl wsize(%esp), from_r
1119
addl write_r, from_r
1120
subl nbytes_r, from_r /* from += wsize + write - nbytes */
1121
subl write_r, nbytes_r /* nbytes -= write */
1124
cmpl nbytes_r, len_r
1125
jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1127
subl nbytes_r, len_r /* len -= nbytes */
1129
movl window(%esp), from_r /* from = window */
1130
movl write(%esp), nbytes_r /* nbytes = write */
1131
cmpl nbytes_r, len_r
1132
jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1134
subl nbytes_r, len_r /* len -= nbytes */
1137
subl dist_r, from_r /* from = out - dist */
1140
.L_contiguous_in_window_mmx:
1141
#define write_r %eax
1142
addl write_r, from_r
1143
subl nbytes_r, from_r /* from += write - nbytes */
1146
cmpl nbytes_r, len_r
1147
jbe .L_do_copy1_mmx /* if (nbytes >= len) */
1149
subl nbytes_r, len_r /* len -= nbytes */
1152
subl dist_r, from_r /* from = out - dist */
1160
movl in(%esp), in_r /* move in back to %esi, toss from */
1161
movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */
1162
jmp .L_while_test_mmx
1167
#endif /* USE_MMX || RUN_TIME_MMX */
1170
/*** USE_MMX, NO_MMX, and RUNTIME_MMX from here on ***/
1172
.L_invalid_distance_code:
1174
* strm->msg = "invalid distance code";
1175
* state->mode = BAD;
1178
movl $.L_invalid_distance_code_msg, %ecx
1179
movl $INFLATE_MODE_BAD, %edx
1180
jmp .L_update_stream_state
1182
.L_test_for_end_of_block:
1183
/* else if (op & 32) {
1184
* state->mode = TYPE;
1189
jz .L_invalid_literal_length_code /* if ((op & 32) == 0) */
1192
movl $INFLATE_MODE_TYPE, %edx
1193
jmp .L_update_stream_state
1195
.L_invalid_literal_length_code:
1197
* strm->msg = "invalid literal/length code";
1198
* state->mode = BAD;
1201
movl $.L_invalid_literal_length_code_msg, %ecx
1202
movl $INFLATE_MODE_BAD, %edx
1203
jmp .L_update_stream_state
1205
.L_invalid_distance_too_far:
1206
/* strm->msg = "invalid distance too far back";
1207
* state->mode = BAD;
1209
movl in(%esp), in_r /* from_r has in's reg, put in back */
1210
movl $.L_invalid_distance_too_far_msg, %ecx
1211
movl $INFLATE_MODE_BAD, %edx
1212
jmp .L_update_stream_state
1214
.L_update_stream_state:
1215
/* set strm->msg = %ecx, strm->state->mode = %edx */
1216
movl strm_sp(%esp), %eax
1217
testl %ecx, %ecx /* if (msg != NULL) */
1219
movl %ecx, msg_strm(%eax) /* strm->msg = msg */
1221
movl state_strm(%eax), %eax /* state = strm->state */
1222
movl %edx, mode_state(%eax) /* state->mode = edx (BAD | TYPE) */
1231
* bits = %ebp when mmx, and in %ebx when non-mmx
1232
* hold = %hold_mm when mmx, and in %ebp when non-mmx
1237
#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1239
#if defined( RUN_TIME_MMX )
1241
cmpl $DO_USE_MMX, inflate_fast_use_mmx
1242
jne .L_update_next_in
1244
#endif /* RUN_TIME_MMX */
1253
#define state_r %edx
1258
* hold &= (1U << bits) - 1;
1259
* state->hold = hold;
1260
* state->bits = bits;
1261
* strm->next_in = in;
1262
* strm->next_out = out;
1264
movl strm_sp(%esp), strm_r
1266
movl state_strm(strm_r), state_r
1271
movl out_r, next_out_strm(strm_r)
1272
movl %ebx, bits_state(state_r)
1275
leal buf(%esp), %ebx
1276
cmpl %ebx, last(%esp)
1277
jne .L_buf_not_used /* if buf != last */
1279
subl %ebx, in_r /* in -= buf */
1280
movl next_in_strm(strm_r), %ebx
1281
movl %ebx, last(%esp) /* last = strm->next_in */
1282
addl %ebx, in_r /* in += strm->next_in */
1283
movl avail_in_strm(strm_r), %ebx
1285
addl %ebx, last(%esp) /* last = &strm->next_in[ avail_in - 11 ] */
1288
movl in_r, next_in_strm(strm_r)
1294
#if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1296
#if defined( RUN_TIME_MMX )
1298
cmpl $DO_USE_MMX, inflate_fast_use_mmx
1301
#endif /* RUN_TIME_MMX */
1303
psrlq used_mm, hold_mm /* hold_mm >>= last bit length */
1310
#endif /* USE_MMX || RUN_TIME_MMX */
1313
movl %ebp, hold_state(state_r)
1317
/* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last) */
1318
movl last(%esp), last_r
1320
jbe .L_last_is_smaller /* if (in >= last) */
1322
subl in_r, last_r /* last -= in */
1323
addl $11, last_r /* last += 11 */
1324
movl last_r, avail_in_strm(strm_r)
1327
subl last_r, in_r /* in -= last */
1328
negl in_r /* in = -in */
1329
addl $11, in_r /* in += 11 */
1330
movl in_r, avail_in_strm(strm_r)
1336
/* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out - end)*/
1337
movl end(%esp), end_r
1339
jbe .L_end_is_smaller /* if (out >= end) */
1341
subl out_r, end_r /* end -= out */
1342
addl $257, end_r /* end += 257 */
1343
movl end_r, avail_out_strm(strm_r)
1346
subl end_r, out_r /* out -= end */
1347
negl out_r /* out = -out */
1348
addl $257, out_r /* out += 257 */
1349
movl out_r, avail_out_strm(strm_r)
1356
addl $local_var_size, %esp
1364
#if defined( GAS_ELF )
1366
.type inflate_fast,@function
1367
.size inflate_fast,.-inflate_fast