420 Intermediate C Programming
}10
printf (" n" );11
}12
// write one bit to a file13
//14
// whichbi t indicat e s which bit this is written to15
// (0 means leftmost , 7 means rightm ost )16
//17
// curbyte is the c u r rent byte18
//19
// if whichbit is zero , curbyte is reset and bit is put20
// to the leftmost bit21
//22
// when whichbit reaches 7, this byte is written to the23
// file and whichbit is reset24
//25
// the functi o n returns 1 if a byte is written to the file26
// returns 0 if no byte is written27
// -1 if it tries to write and fails28
int writeBit ( FILE * fptr , unsigned char bit ,29
unsigned char * whichbit ,30
unsigned char * curbyte )31
{32
i f ((* whichbit ) == 0)33
{34
// reset35
* curbyte = 0;36
}37
// shift the bit to the correct location38
unsigned char temp = bit << (7 - (* whichbit )) ;39
* curbyte |= temp ; // store the data40
int value = 0;41
i f ((* whichbit ) == 7)42
{43
int ret ;44
ret = fwrite ( curbyte , s i z e o f ( unsigned char ) , 1, fptr );45
// prin t Byte (* curbyte ); // for d e bugging46
i f ( ret == 1)47
{48
value = 1;49
}50
e l s e51
{52
value = -1;53
}54
}55
* whichbit = ((* whic h bit ) + 1) % 8;56
return value ;57
}58
Huffman Compression 421
This function writes the tree (the header of the file) to the file. When a leaf node is
visited, one command bit (value is 1) is written to the file, followed by the 7 bits of the
character. When a non-leaf node is visited, one commend bit (value is 0) is written to the
file. If some bits of the last byte are not used, these bits are set to zero. The new line
character ends the header.
// print the 7 bits of an ASCII charac ter1
s t a t i c void char2bits ( FILE * outfptr , i n t ch ,2
unsigned char * whichbit ,3
unsigned char * curbyte )4
{5
unsigned char mask = 0 x40 ; // only 7 bits6
while ( mask > 0)7
{8
writeBit ( outfptr , ( ch & mask ) == mask ,9
whichbit , curbyte );10
mask >>= 1;11
}12
}13
s t a t i c void Tree_ heade rHelp er ( TreeNode * tn , FILE * outfptr ,14
unsigned char * whichbit ,15
unsigned char * curbyte )16
{17
i f ( tn == NULL )18
{19
return ;20
}21
TreeNode * lc = tn -> left ;22
TreeNode * rc = tn -> right ;23
i f (( lc == NULL ) && ( rc == NULL ) )24
{25
// leaf node26
writeBit ( outfptr , 1 , whichbit , curbyte ) ;27
char2bits ( outfptr , tn -> value , whichbit , c u r b y t e ) ;28
return ;29
}30
Tree _head erHel per ( lc , outfptr , whichbit , curbyte );31
Tree _head erHel per ( rc , outfptr , whichbit , curbyte );32
writeBit ( outfptr , 0 , whichbit , curbyte ) ;33
}34
void Tree_he ader ( TreeNode * tn , char * outfile )35
{36
FILE * outfptr = fopen ( outfile , "w ");37
i f ( outfptr == NULL )38
{39
return ;40
}41
unsigned char whichbit = 0;42
unsigned char curbyte = 0;43
Tree _head erHel per ( tn , outfptr , & whichbit , & curbyte );44
while ( whichbit != 0)45
422 Intermediate C Programming
{46
// if the current byte has unused bits47
writeBit ( outfptr , 0 , & whichbit , & curby t e );48
}49
unsigned char newline = n ; // add n at the end50
fwrite (& newline , s i z e o f ( unsigned char ) , 1, outfptr );51
fclose ( outfptr );52
}53
This is the compress function. It writes the bits to the file. If some bits of the last byte
is unused, these bits are zero.
int compress ( char * infile , char * outfile ,1
int * * codebook , i n t * mapping )2
{3
FILE * infptr = fopen ( infile , "r ") ;4
i f ( infptr == NULL )5
{6
return 0;7
}8
FILE * outfptr = fopen ( outfile , "a "); // append9
i f ( outfptr == NULL )10
{11
fclose ( outfptr );12
return 0;13
}14
unsigned char whichbit = 0;15
unsigned char curbyte = 0;16
while (! feof ( infptr ) )17
{18
int onechar = fgetc ( infptr ) ;19
i f ( onechar != EOF )20
{21
int ind = mapping [ o n e c h a r ];22
int ind2 = 1;23
while ( codebook [ ind ][ ind2 ] != -1)24
{25
writeBit ( outfptr , ( codebook [ ind ][ ind2 ] == 1) ,26
& whichbit , & curbyte );27
ind2 ++;28
}29
}30
}31
while ( whichbit != 0)32
{33
// if the current byte has unused bits34
writeBit ( outfptr , 0 , & whichbit , & curby t e );35
}36
fclose ( infptr ) ;37
fclose ( outfptr );38
return 1;39
}40
Huffman Compression 423
After writing the code book, the header uses the next 4 bytes (32 bits) to write the
length of the article. This is an unsigned integer and can be as large as 2
32
1, more than
four billion. A 1000-page novel has about 500,000 words, a few million characters. Thus, 32
bits are sufficient. After the length, a new line character is written to the file, signifying the
end of the header.
The output file cannot be viewed easily in a text editor, because the data is compressed
in bit representations. We can use the xxd program in Linux to see the hexadecimal values
that represent each byte in the file. The following shows the compressed file in hexadecimal
format using xxd.
0000000: e8e4 f0d4 c5ce 639c e0b2 0000 000a e9d7 ......c.........
0000010: af7d f7df 7df7 ce73 9ce7 3fff ffff ffff .}..}..s..?.....
0000020: ffff ffff 6db6 db6d b6db 6db6 d555 5555 ....m..m..m..UUU
0000030: 5555 5555 5555 5554 0000 0000 0000 0000 UUUUUUUT........
0000040: 00
The following shows the compressed file in binary format using
0000000: 11101000 11100100 11110000 11010100 11000101 11001110 ......
0000006: 01100011 10011100 11100000 10110010 00000000 00000000 c.....
000000c: 00000000 00001010 11101001 11010111 10101111 01111101 .....}
0000012: 11110111 11011111 01111101 11110111 11001110 01110011 ..}..s
0000018: 10011100 11100111 00111111 11111111 11111111 11111111 ..?...
000001e: 11111111 11111111 11111111 11111111 11111111 11111111 ......
0000024: 01101101 10110110 11011011 01101101 10110110 11011011 m..m..
000002a: 01101101 10110110 11010101 01010101 01010101 01010101 m..UUU
0000030: 01010101 01010101 01010101 01010101 01010101 01010101 UUUUUU
0000036: 01010101 01010100 00000000 00000000 00000000 00000000 UT....
000003c: 00000000 00000000 00000000 00000000 00000000 .....
24.3 Decoding
Decoding is the reverse of encoding. The decoder first reconstructs the code tree from
the file header, and then reads the compressed codes of the characters. From the codes, the
decompresser traverses the code tree and outputs the characters stored in the tree’s leaf
nodes. To reconstruct the tree, the decoder needs to know how the tree is represented in the
header. In our case, the code book is encoded using the rules described in Section 24.2.5.
The header contains both commands (1 bit, either 0 or 1) and characters (7 bits). To build
the code tree, the decoder does the following:
The first bit is a command bit and it is always 1.
If a command is 1, then the next 7 bits are the value stored in a leaf node. Create a
tree node to store this value. Add this tree node to the beginning of the list. This tree
node is a single-node tree.
If a command is 0 and the list has only one node, then the complete tree has been
built. If a command is 0 and the list has two or more nodes, then take the first two
nodes from the list, create a tree node as the parent. Add this parent node to the list.
After the tree is completely built, then read one more bit. If this is not the last
(rightmost) bit of the byte, discard the remaining bits in the byte. The next four
424 Intermediate C Programming
bytes (an unsigned int) store the number of characters in the article. This number
is followed by a new line n character.
Consider the header in Section 24.2.6. The decoder reads one bit from the compressed
file. This bit is 1 and then the decoder reads another 7 bits. Fig. 24.12 to Fig. 24.15 show
how to reconstruct the tree from the header. The first character is ’h’. One tree node is
created and it is pointed to by one list node as shown in Fig. 24.12 (a). For decoding, as
long as the tree can be rebuilt, the frequencies of the characters are not needed. Fig. 24.12
(b) shows the list after reading the first two bytes. Fig. 24.12 (c) shows the list after reading
the first six bytes. In Fig. 24.12 (d), the first two tree nodes share the same parent. The
first tree node becomes the right child and the second tree node is the left child, making E
and N share a parent node. This is because the code book was encoded using a post-order
traversal. Please notice the symmetry between this figure and Fig. 24.3. In Fig. 24.12 (e),
this command is followed by 7 bits of data for the character G. Another tree node for G is
added to the list.
(a) (b)
(c)
(d)
(e)
FIGURE 24.12: (a) One tree node is added after reading the first command and the first
character. (b) After reading two bytes. (c) After reading six bytes. (d) The first bit in the
seventh byte is a command and it is 0. (e) The next command bit (the second bit in the
seventh byte) is 1.
..................Content has been hidden....................

You can't read the all page of ebook, please click here login for view all page.
Reset