1
/******************************************************************************
2
* lzsscomprs.cpp - code for class 'LZSSCompress'- a driver class that
3
* provides LZSS compression
6
* Copyright 2009 CrossWire Bible Society (http://www.crosswire.org)
7
* CrossWire Bible Society
11
* This program is free software; you can redistribute it and/or modify it
12
* under the terms of the GNU General Public License as published by the
13
* Free Software Foundation version 2.
15
* This program is distributed in the hope that it will be useful, but
16
* WITHOUT ANY WARRANTY; without even the implied warranty of
17
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18
* General Public License for more details.
24
#include <lzsscomprs.h>
28
/******************************************************************************
29
* LZSSCompress Statics
32
// m_ring_buffer is a text buffer. It contains "nodes" of
33
// uncompressed text that can be indexed by position. That is,
34
// a substring of the ring buffer can be indexed by a position
35
// and a length. When decoding, the compressed text may contain
36
// a position in the ring buffer and a count of the number of
37
// bytes from the ring buffer that are to be moved into the
38
// uncompressed buffer.
40
// This ring buffer is not maintained as part of the compressed
41
// text. Instead, it is reconstructed dynamically. That is,
42
// it starts out empty and gets built as the text is decompressed.
44
// The ring buffer contain N bytes, with an additional F - 1 bytes
45
// to facilitate string comparison.
47
unsigned char LZSSCompress::m_ring_buffer[N + F - 1];
49
// m_match_position and m_match_length are set by InsertNode().
51
// These variables indicate the position in the ring buffer
52
// and the number of characters at that position that match
55
short int LZSSCompress::m_match_position;
56
short int LZSSCompress::m_match_length;
58
// m_lson, m_rson, and m_dad are the Japanese way of referring to
59
// a tree structure. The dad is the parent and it has a right and
62
// For i = 0 to N-1, m_rson[i] and m_lson[i] will be the right
63
// and left children of node i.
65
// For i = 0 to N-1, m_dad[i] is the parent of node i.
67
// For i = 0 to 255, rson[N + i + 1] is the root of the tree for
68
// strings that begin with the character i. Note that this requires
69
// one byte characters.
71
// These nodes store values of 0...(N-1). Memory requirements
72
// can be reduces by using 2-byte integers instead of full 4-byte
73
// integers (for 32-bit applications). Therefore, these are
74
// defined as "short ints."
76
short int LZSSCompress::m_lson[N + 1];
77
short int LZSSCompress::m_rson[N + 257];
78
short int LZSSCompress::m_dad[N + 1];
81
/******************************************************************************
82
* LZSSCompress Constructor - Initializes data for instance of LZSSCompress
86
LZSSCompress::LZSSCompress() : SWCompress() {
90
/******************************************************************************
91
* LZSSCompress Destructor - Cleans up instance of LZSSCompress
94
LZSSCompress::~LZSSCompress() {
98
/******************************************************************************
99
* LZSSCompress::InitTree - This function initializes the tree nodes to
103
void LZSSCompress::InitTree(void) {
106
// For i = 0 to N - 1, m_rson[i] and m_lson[i] will be the right
107
// and left children of node i. These nodes need not be
108
// initialized. However, for debugging purposes, it is nice to
109
// have them initialized. Since this is only used for compression
110
// (not decompression), I don't mind spending the time to do it.
112
// For the same range of i, m_dad[i] is the parent of node i.
113
// These are initialized to a known value that can represent
114
// a "not used" state.
116
for (i = 0; i < N; i++) {
117
m_lson[i] = NOT_USED;
118
m_rson[i] = NOT_USED;
122
// For i = 0 to 255, m_rson[N + i + 1] is the root of the tree
123
// for strings that begin with the character i. This is why
124
// the right child array is larger than the left child array.
125
// These are also initialzied to a "not used" state.
127
// Note that there are 256 of these, one for each of the possible
130
for (i = N + 1; i <= (N + 256); i++) {
131
m_rson[i] = NOT_USED;
136
/******************************************************************************
137
* LZSSCompress::InsertNode - This function inserts a string from the ring
138
* buffer into one of the trees. It loads the
139
* match position and length member variables
140
* for the longest match.
142
* The string to be inserted is identified by
143
* the parameter Pos, A full F bytes are
145
* m_ring_buffer[Pos ... Pos+F-1]
148
* If the matched length is exactly F, then an
149
* old node is removed in favor of the new one
150
* (because the old one will be deleted
153
* Note that Pos plays a dual role. It is
154
* used as both a position in the ring buffer
155
* and also as a tree node.
156
* m_ring_buffer[Pos] defines a character that
157
* is used to identify a tree node.
159
* ENT: pos - position in the buffer
162
void LZSSCompress::InsertNode(short int Pos)
175
key = &(m_ring_buffer[Pos]);
177
// The last 256 entries in m_rson contain the root nodes for
178
// strings that begin with a letter. Get an index for the
179
// first letter in this string.
181
p = (short int) (N + 1 + key[0]);
183
// Set the left and right tree nodes for this position to "not
186
m_lson[Pos] = NOT_USED;
187
m_rson[Pos] = NOT_USED;
189
// Haven't matched anything yet.
195
if (m_rson[p] != NOT_USED) {
205
if (m_lson[p] != NOT_USED) {
215
// Should we go to the right or the left to look for the
218
for (i = 1; i < F; i++) {
219
cmp = key[i] - m_ring_buffer[p + i];
224
if (i > m_match_length) {
225
m_match_position = p;
233
m_dad[Pos] = m_dad[p];
234
m_lson[Pos] = m_lson[p];
235
m_rson[Pos] = m_rson[p];
237
m_dad[ m_lson[p] ] = Pos;
238
m_dad[ m_rson[p] ] = Pos;
240
if (m_rson[ m_dad[p] ] == p) {
241
m_rson[ m_dad[p] ] = Pos;
244
m_lson[ m_dad[p] ] = Pos;
253
/******************************************************************************
254
* LZSSCompress::DeleteNode - This function removes the node "Node" from the
257
* ENT: node - node to be removed
260
void LZSSCompress::DeleteNode(short int Node)
266
ASSERT(Node < (N+1));
269
if (m_dad[Node] == NOT_USED) { // not in tree, nothing to do
273
if (m_rson[Node] == NOT_USED) {
276
else if (m_lson[Node] == NOT_USED) {
281
if (m_rson[q] != NOT_USED) {
284
} while (m_rson[q] != NOT_USED);
286
m_rson[ m_dad[q] ] = m_lson[q];
287
m_dad[ m_lson[q] ] = m_dad[q];
288
m_lson[q] = m_lson[Node];
289
m_dad[ m_lson[Node] ] = q;
292
m_rson[q] = m_rson[Node];
293
m_dad[ m_rson[Node] ] = q;
296
m_dad[q] = m_dad[Node];
298
if (m_rson[ m_dad[Node] ] == Node) {
299
m_rson[ m_dad[Node] ] = q;
302
m_lson[ m_dad[Node] ] = q;
305
m_dad[Node] = NOT_USED;
309
/******************************************************************************
310
* LZSSCompress::Encode - This function "encodes" the input stream into the
312
* The GetChars() and SendChars() functions are
313
* used to separate this method from the actual
315
* NOTE: must set zlen for parent class to know length of
319
void LZSSCompress::Encode(void)
321
short int i; // an iterator
322
short int r; // node number in the binary tree
323
short int s; // position in the ring buffer
324
unsigned short int len; // len of initial string
325
short int last_match_length; // length of last match
326
short int code_buf_pos; // position in the output buffer
327
unsigned char code_buf[17]; // the output buffer
328
unsigned char mask; // bit mask for byte 0 of out buf
329
unsigned char c; // character read from string
331
// Start with a clean tree.
334
direct = 0; // set direction needed by parent [Get|Send]Chars()
336
// code_buf[0] works as eight flags. A "1" represents that the
337
// unit is an unencoded letter (1 byte), and a "0" represents
338
// that the next unit is a <position,length> pair (2 bytes).
340
// code_buf[1..16] stores eight units of code. Since the best
341
// we can do is store eight <position,length> pairs, at most 16
342
// bytes are needed to store this.
344
// This is why the maximum size of the code buffer is 17 bytes.
349
// Mask iterates over the 8 bits in the code buffer. The first
350
// character ends up being stored in the low bit.
352
// bit 8 7 6 5 4 3 2 1
354
// | first sequence in code buffer
356
// last sequence in code buffer
361
r = (short int) N - (short int) F;
363
// Initialize the ring buffer with spaces...
365
// Note that the last F bytes of the ring buffer are not filled.
366
// This is because those F bytes will be filled in immediately
367
// with bytes from the input stream.
369
memset(m_ring_buffer, ' ', N - F);
371
// Read F bytes into the last F bytes of the ring buffer.
373
// This function loads the buffer with X characters and returns
374
// the actual amount loaded.
376
len = GetChars((char *) &(m_ring_buffer[r]), F);
378
// Make sure there is something to be compressed.
383
// Insert the F strings, each of which begins with one or more
384
// 'space' characters. Note the order in which these strings
385
// are inserted. This way, degenerate trees will be less likely
388
for (i = 1; i <= F; i++) {
389
InsertNode((short int) (r - i));
392
// Finally, insert the whole string just read. The
393
// member variables match_length and match_position are set.
397
// Now that we're preloaded, continue till done.
401
// m_match_length may be spuriously long near the end of
404
if (m_match_length > len) {
405
m_match_length = len;
408
// Is it cheaper to store this as a single character? If so,
411
if (m_match_length < THRESHOLD) {
412
// Send one character. Remember that code_buf[0] is the
413
// set of flags for the next eight items.
417
code_buf[code_buf_pos++] = m_ring_buffer[r];
420
// Otherwise, we do indeed have a string that can be stored
421
// compressed to save space.
424
// The next 16 bits need to contain the position (12 bits)
425
// and the length (4 bits).
427
code_buf[code_buf_pos++] = (unsigned char) m_match_position;
428
code_buf[code_buf_pos++] = (unsigned char) (
429
((m_match_position >> 4) & 0xf0) |
430
(m_match_length - THRESHOLD) );
433
// Shift the mask one bit to the left so that it will be ready
434
// to store the new bit.
436
mask = (unsigned char) (mask << 1);
438
// If the mask is now 0, then we know that we have a full set
439
// of flags and items in the code buffer. These need to be
443
// code_buf is the buffer of characters to be output.
444
// code_buf_pos is the number of characters it contains.
446
SendChars((char *) code_buf, code_buf_pos);
448
// Reset for next buffer...
455
last_match_length = m_match_length;
457
// Delete old strings and read new bytes...
459
for (i = 0; i < last_match_length; i++) {
460
// Get next character...
462
if (GetChars((char *) &c, 1) != 1)
465
// Delete "old strings"
469
// Put this character into the ring buffer.
471
// The original comment here says "If the position is near
472
// the end of the buffer, extend the buffer to make
473
// string comparison easier."
475
// That's a little misleading, because the "end" of the
476
// buffer is really what we consider to be the "beginning"
477
// of the buffer, that is, positions 0 through F.
479
// The idea is that the front end of the buffer is duplicated
480
// into the back end so that when you're looking at characters
481
// at the back end of the buffer, you can index ahead (beyond
482
// the normal end of the buffer) and see the characters
483
// that are at the front end of the buffer wihtout having
484
// to adjust the index.
488
// 1234xxxxxxxxxxxxxxxxxxxxxxxxxxxxx1234
490
// position 0 end of buffer |
492
// duplicate of front of buffer
494
m_ring_buffer[s] = c;
497
m_ring_buffer[s + N] = c;
500
// Increment the position, and wrap around when we're at
501
// the end. Note that this relies on N being a power of 2.
503
s = (short int) ( (s + 1) & (N - 1) );
504
r = (short int) ( (r + 1) & (N - 1) );
506
// Register the string that is found in
507
// m_ring_buffer[r..r+F-1].
512
// If we didn't quit because we hit the last_match_length,
513
// then we must have quit because we ran out of characters
516
while (i++ < last_match_length) {
519
s = (short int) ( (s + 1) & (N - 1) );
520
r = (short int) ( (r + 1) & (N - 1) );
522
// Note that len hitting 0 is the key that causes the
523
// do...while() to terminate. This is the only place
524
// within the loop that len is modified.
526
// Its original value is F (or a number less than F for
530
InsertNode(r); /* buffer may not be empty. */
534
// End of do...while() loop. Continue processing until there
535
// are no more characters to be compressed. The variable
536
// "len" is used to signal this condition.
539
// There could still be something in the output buffer. Send it
542
if (code_buf_pos > 1) {
543
// code_buf is the encoded string to send.
544
// code_buf_ptr is the number of characters.
546
SendChars((char *) code_buf, code_buf_pos);
550
// must set zlen for parent class to know length of compressed buffer
555
/******************************************************************************
556
* LZSSCompress::Decode - This function "decodes" the input stream into the
558
* The GetChars() and SendChars() functions are
559
* used to separate this method from the actual
563
void LZSSCompress::Decode(void)
566
int r; // node number
567
unsigned char c[F]; // an array of chars
568
unsigned char flags; // 8 bits of flags
569
int flag_count; // which flag we're on
570
short int pos; // position in the ring buffer
571
short int len; // number of chars in ring buffer
572
unsigned long totalLen = 0;
574
direct = 1; // set direction needed by parent [Get|Send]Chars()
576
// Initialize the ring buffer with a common string.
578
// Note that the last F bytes of the ring buffer are not filled.
580
memset(m_ring_buffer, ' ', N - F);
589
// If there are more bits of interest in this flag, then
590
// shift that next interesting bit into the 1's position.
592
// If this flag has been exhausted, the next byte must
595
if (flag_count > 0) {
596
flags = (unsigned char) (flags >> 1);
600
// Next byte must be a flag.
602
if (GetChars((char *) &flags, 1) != 1)
605
// Set the flag counter. While at first it might appear
606
// that this should be an 8 since there are 8 bits in the
607
// flag, it should really be a 7 because the shift must
608
// be performed 7 times in order to see all 8 bits.
613
// If the low order bit of the flag is now set, then we know
614
// that the next byte is a single, unencoded character.
617
if (GetChars((char *) c, 1) != 1)
620
if (SendChars((char *) c, 1) != 1) {
625
// Add to buffer, and increment to next spot. Wrap at end.
627
m_ring_buffer[r] = c[0];
628
r = (short int) ( (r + 1) & (N - 1) );
631
// Otherwise, we know that the next two bytes are a
632
// <position,length> pair. The position is in 12 bits and
633
// the length is in 4 bits.
637
// if ((i = getc(infile)) == EOF)
639
// if ((j = getc(infile)) == EOF)
641
// i |= ((j & 0xf0) << 4);
642
// j = (j & 0x0f) + THRESHOLD;
644
// I've modified this to only make one input call, and
645
// have changed the variable names to something more
648
if (GetChars((char *) c, 2) != 2)
651
// Convert these two characters into the position and
652
// length. Note that the length is always at least
653
// THRESHOLD, which is why we're able to get a length
654
// of 18 out of only 4 bits.
656
pos = (short int) ( c[0] | ((c[1] & 0xf0) << 4) );
658
len = (short int) ( (c[1] & 0x0f) + THRESHOLD );
660
// There are now "len" characters at position "pos" in
661
// the ring buffer that can be pulled out. Note that
662
// len is never more than F.
664
for (k = 0; k < len; k++) {
665
c[k] = m_ring_buffer[(pos + k) & (N - 1)];
667
// Add to buffer, and increment to next spot. Wrap at end.
669
m_ring_buffer[r] = c[k];
670
r = (short int) ( (r + 1) & (N - 1) );
673
// Add the "len" :characters to the output stream.
675
if (SendChars((char *) c, len) != (unsigned int)len) {