250 likes | 468 Views
Huffman Compression Project. In this project you are to implement the Huffman code algorithm as a file compression/uncompression utility. You will scan the input text file to be compressed and compute the frequencies of each character that appears in the file, as well as the
E N D
Huffman Compression Project. In this project you are to implement the Huffman code algorithm as a file compression/uncompression utility. You will scan the input text file to be compressed and compute the frequency of each character that appears in the file, as well as the count of distinct characters in the file. From this information you can build the Huffman code tree and thus obtain the code table needed for encoding the text. To enable decompression of the file by another party, you will include information as a header in the file (i.e., before the actual encoded text). This header starts with the count of distinct characters in the file. This value is written as a single byte using the putChar method of the BitOStream class (note that a file containing all 256 possible byte values has a count of 256, which wraps to 0 when stored in one byte).
The distinct character count is written to the output file using the putChar method of the BitOStream class. If the count is greater than one, it is followed by an unsigned char that specifies the number of bits in the last byte of the output file that will be valid. We then output the symbols on the leaves of the code tree in left-to-right order. All the operations up to this point are done using the BitOStream's putChar method. Then the representation of the code tree structure is output to the BitOStream (repTree). This is followed by the character-by-character bit encoding of the text file.
Compressed File Format (in file order): [symbolCount: 1 byte] [bits in last byte: 1 byte] [leaf symbols: symbolCount bytes] [bit-encoded code tree] [encoded text file]. The first three fields are output as bytes; the code tree and the encoded text are output as bits.
You will be provided with a main program file (prog4.cpp), a header file (prehuff.h) for the functions used in prog4.cpp, and an incomplete file called prehuff.cpp. The latter file is supposed to contain the code for the functions not supplied in prog4.cpp. The two critical functions are huff (for compression) and unhuff (for uncompression). Your job is to complete the missing code in prehuff.cpp. You will use a priority queue to hold the forest.
We will now examine the contents of the above files. As usual, you are responsible for clearly understanding all of the functions in these files, not just the ones you are to complete.
// huffnode.h
#pragma once

// One node of a Huffman code tree.
// A node with no left child is treated as a leaf by the tree-walking code.
struct huffnode {
    unsigned char sym;        // symbol stored in this node
    int weight;               // weight compared when nodes are ordered in the priority queue
    huffnode *left, *right;   // children; both default to null

    // Build a node holding symbol s with weight w and optional children L and R.
    huffnode(unsigned char s, int w, huffnode *L = 0, huffnode *R = 0)
        : sym(s), weight(w), left(L), right(R) {}
};

// Convenience alias used throughout the project for a pointer to a tree node.
typedef huffnode *huffnodeptr;
// prehuff.h #pragma once #include <queue> #include <vector> #include <iostream> #include <fstream> #include <string> #include "BitIStream.h" #include "BitOStream.h" #include "huffnode.h" class mycomparison { boolreverse; public: mycomparison(constbool& revparam=false) {reverse=revparam;} booloperator() (consthuffnodeptr & lhs, consthuffnodeptr & rhs) const { if(reverse) return (lhs->weight < rhs->weight); elsereturn (lhs->weight > rhs->weight); } };
typedefpriority_queue<huffnodeptr,vector<huffnodeptr>,mycomparison> nodepq; // Utility function void rewind(ifstream & in); /* Reset in to the beginning of the file */ intgetFrequencies(vector<int> & freqs, ifstream & inText); /* Pre: inText has been opened and is at beginning of the file Post: freq[c] equals the number of occurrences of symbol c in inText Returns: the number of distinct characters in the file */ void showCodeTable(const vector<string> & codes); huffnodeptrbuildHuffmanTree(const vector<int> &freqs); /***************************************************************** Pre: freqs contains the frequencies of characters in a file Post: The optimal code tree for the frequencies has been constructed using the Huffman algorithm. Returns: A pointer to the root of the optimal code tree *****************************************************************/
void huff(string textFileName, string compFileName); // Compress text file to compressed file booldecodeText( BitIStream & in, ofstream & out, huffnodeptr root); /* Precondition: the BitIStream is at the beginning of a file produced by the huff function, the output file has just been opened for writing root is the root of the huffman code tree Postcondition: if true is returned, then the decoded text has been placed in the file out. Otherwise, insufficient bits in BitIStream */ void unhuff(string cmpFileName, string outFileName); /* Precondition: file cmpFileName was produced by the huff function Postcondition: file outFileName contains the uncompressed version of cmpFileName */
// prehuff.cpp #include "prehuff.h" #include "bitrep.h" #include <cassert> using namespace std; // Utility function void rewind(ifstream & in) { in.clear(); in.seekg(0,in.beg); } // Debugging tool void showCodeTable(const vector<string> &codes) { for(inti = 0; i < 256; i++) if (codes[i].length() >0) cout << i << ": " << codes[i] << endl; cout << endl; }
/****************** Compressing a file *********************/ void getCodeWords( huffnodeptr p, vector<string> & codewords, string path) { if (p->left == 0) { // Since the tree is a two-tree, *p is a leaf codewords[p->sym] = path; } else { getCodeWords(p->left,codewords,path+"0"); getCodeWords(p->right,codewords,path+"1"); } }
// Compute size statistics for a file about to be compressed.
//   freqs    - occurrence count per symbol (at least 256 entries)
//   codes    - codeword per symbol (at least 256 entries)
//   textsize - out: total number of symbols in the text (sum of freqs)
//   compsize - out: bytes needed for the bit-encoded part of the output,
//              i.e. the tree representation plus the encoded text
//   lastbits - out: number of valid bits in the final byte (1..8; 8 means
//              the final byte is completely used)
// NOTE(review): compsize does not include the byte-oriented header (symbol
// count, last-bits byte, leaf symbols) - confirm whether that is intended.
void getStats(const std::vector<int> &freqs, const std::vector<std::string> &codes, // input parameters
              int &textsize, int &compsize, unsigned char &lastbits)               // output parameters
{
    assert(freqs.size() >= 256 && codes.size() >= 256);

    int symbolcount = 0, residue = 0, bytes = 0;
    textsize = 0;
    for (int i = 0; i < 256; i++)
        if (freqs[i] > 0) {
            symbolcount++;
            textsize += freqs[i];
            // bits contributed by every occurrence of symbol i
            const int symbits = freqs[i] * static_cast<int>(codes[i].length());
            bytes += symbits / 8;
            residue += symbits % 8;
        }

    // now account for the bits of the tree representation
    // (2n-1 bits for n symbols; nothing at all when there are no symbols)
    const int treebits = (symbolcount > 0) ? 2 * symbolcount - 1 : 0;
    bytes += treebits / 8;
    residue += treebits % 8;

    // fold the accumulated leftover bits into whole bytes
    bytes += residue / 8;
    residue = residue % 8;

    compsize = bytes;
    if (residue > 0)
        compsize++;      // a partially filled final byte still occupies a byte
    if (residue == 0)
        residue = 8;     // no leftover bits: the final byte is full
    lastbits = static_cast<unsigned char>(residue);
}
// Print the source size, the compressed size, and the compressed size as a
// percentage of the source size.
// textsize is expected to be non-zero, since it is used as a divisor.
void showStats(int textsize, int compsize)
{
    printf("\nNumber of bytes in source text file: %d\n", textsize);
    printf("Number of bytes in compressed version of file: %d,\n", compsize);
    printf("which is %.2lf percent of source text size.\n",
           ((double) compsize / textsize) * 100);
}

// Count how often each symbol occurs in inText.
//   freqs  - must have exactly 256 entries (checked with assert); on return
//            freqs[c] is meant to hold the number of occurrences of symbol c
//   inText - input stream positioned at the beginning of the file
// Returns the number of distinct symbols found.
// The counting loop itself is left for the student to supply; the locals
// below are provided for that code.
int getFrequencies(std::vector<int> &freqs, std::ifstream &inText)
{
    char ch;
    assert(freqs.size() == 256);
    int sum = 0;
    int symbolcount = 0, i;

    // YOU SUPPLY THE MISSING CODE HERE

    return symbolcount;
}
huffnodeptrbuildHuffmanTree(const vector<int> & freqs) • { • inti; • nodepq forest; • huffnodeptrleftP, rightP, tmp; • /* ----- FILL IN MISSING CODE -----*/ • // First, build the initial forest of one-node trees • // Now carry out the Huffman algorithm to build the code tree • while(forest.size() > 1) { • // remove the two least weighted roots in forest • // create a new tree using these trees according to the • // Huffman algorithm and insert the root pointer into forest • } • huffnodeptrretval = forest.top(); • forest.pop(); • return retval; • }
/* COMPRESSION (huff function):
   Declare and initialize needed variables
   Open text file and the BitOStream
   Scan text file and compute frequencies and count (symbolCount) of
       distinct symbols
   Cast symbolCount to an unsigned char and output it using the
       BitOStream putChar method
   If the count is 1, then
       put the unique unsigned character to the output stream
           (BitOStream putChar method)
       write the frequency of that character to the file
           (BitOStream putInt method)
       close the text file and the BitOStream, and return
   Construct huffman tree
   Call getCodeWords to construct table of codewords
   Compute the textsize, compressed file size, and the number of valid bits
       in the last byte of the compressed file (getStats)
   Use the BitOStream putChar method to output the number of valid bits
       in the last byte as an unsigned char.
   Call putLeafLabels to write leaf symbols in left-right tree order
       to the file.
   Call repTree to output tree structure.
   Use the rewind function to prepare the text file for re-reading
   Scan the text file, sending the codeword for each character
       to the output file using putBitString function
   Close the text file and BitOStream.
*/
void huff(string textFileName, string compFileName) • { • ifstreaminText(textFileName.data(),ios::binary); • BitOStream out(compFileName); • inti, textsize, cmpsize, count; • unsigned char lastbits; • char c; • vector<int> frequencies(256,0); • vector<string> codewords(256,""); • huffnodeptr root; • string thePath(""); • count = getFrequencies(frequencies, inText); • if (count == 0){ • inText.close(); • out.close(); • return; • } • out.putChar((unsigned char) count);
if (count == 1) { • for (i = 0; i <= 255; i++) • if (frequencies[i] > 0) • break; /* This is the unique character in file */ • out.putChar((unsigned char) i); • out.putInt(frequencies[i]); • inText.close(); • out.close(); • return; • } • /* At least two distinct characters in the text file */ • /******** YOU FILL IN THE MISSING CODE HERE *********/ • inText.close(); • out.close(); • showStats (textsize,cmpsize); • }
booldecodeText( BitIStream & in, ofstream & out, huffnodeptr root) • { • char b; • huffnodeptr p = root; • while (in.getBit(b)) { • /* FILL IN THE MISSING CODE • input bits and move down the tree until a leaf • is found; at that point, output the leaf symbol • and reset p to the root • */ • } • if (p != root) { • cerr << "Error: unexpected end of bit output stream\n"; • return false; • } else • return true; • }
/* UNCOMPRESSION (unhuff function):
   declare and initialize variables, including the output file and BitIStream
   input the symbol count from the input stream (BitIStream getChar)
   if the count is 1
       get the character from the input stream (BitIStream getChar)
       read in the number of occurrences from the file (BitIStream getInt)
       write the character to the output file the required number of times
       close the BitIStream and the output file
       return
   input the number of valid bits in the last byte (BitIStream getChar)
   input the leaf characters from the file and place in an array
   input the huffman code tree, using recoverTree
   use the array of leaf symbols and the root pointer to label the leaves of
       the code tree
   call decodeText to decode the input file
   close the BitIStream and the output file
*/
void unhuff(string cmpFileName, string outFileName) • { • /* Declare and initialize variables, including opening any files or streams */ • int m, count, i; • unsigned char c; • unsigned char bitsInLastByte, c1,c2; • vector<unsigned char> symArray(256); • huffnodeptr root; • unsigned char symbolCount; • ofstream out(outFileName.data()); • BitIStreamcompFile(cmpFileName); • /* input the symbol count from the input stream (fgetc) */ • compFile.getChar(symbolCount); • if (symbolCount == 1) { • /* FILL IN THE MISSING CODE */ • } else { • compFile.getChar(bitsInLastByte); • /* FILL IN THE MISSING CODE */ • out.close(); • compFile.close(); • } • cout << "File recovered from compressed version\n"; • }
/********** prog4.c **********/ • #include <iostream> • #include "prehuff.h" • using namespace std; • int main() • { • string textName, compressName; • cout << "\nFILE COMPRESSION PHASE.\n" • "\nEnter name of file to compress: "; • cin >> textName; • cout << "Enter name for compressed file: "; • cin >> compressName; • huff(textName,compressName); • cout << "\nCompression completed\n\n\n"; • cout << "FILE DECOMPRESSION PHASE.\n" • "\nEnter name of file to decompress: "; • cin >> compressName; • cout << "Enter name for decompressed text file: "; • cin >> textName; • unhuff(compressName,textName); • cout << "\nDecompression completed\n\n"; • return 0; • }
# Build the Huffman compression utility.
# The object for prog4.cpp is named prog4.o because that is the file
# "g++ -c prog4.cpp" produces.
main: prog4.o prehuff.o BitIStream.o BitOStream.o bitrep.o
	g++ prog4.o prehuff.o BitIStream.o BitOStream.o bitrep.o -o main

prog4.o: prog4.cpp prehuff.h BitIStream.h BitOStream.h huffnode.h
	g++ -c prog4.cpp

prehuff.o: prehuff.cpp prehuff.h BitIStream.h BitOStream.h bitrep.h huffnode.h
	g++ -c prehuff.cpp

BitIStream.o: BitIStream.cpp BitIStream.h
	g++ -c BitIStream.cpp

BitOStream.o: BitOStream.cpp BitOStream.h
	g++ -c BitOStream.cpp

bitrep.o: bitrep.cpp bitrep.h huffnode.h BitIStream.h BitOStream.h
	g++ -c bitrep.cpp