/******************************************************************************* * * * HUF.C by Shaun Case April 1991 * * * * Written in Borland C++ 2.0 under MS-DOS 3.3 * * * * Uses Huffman encoding to compress a single file. * * This program is in the public domain. * * * * atman%ecst.csuchico.edu@RELAY.CS.NET * * * * * *******************************************************************************/ #include #include #include /* for storing filename in output file */ #define FALSE 0 /* i'm still deciding on a style... */ #define TRUE !FALSE /* * for lots of interesting (?) diagnostics, uncomment the following: #define VERBOSE * */ typedef struct chardata { /* template for leaves and nodes in tree */ short charnum; /* which character to be encoded */ /* don't want it lost in the sort */ unsigned long total; /* total occurances in the file */ short seq_length; /* length of this char's huffman code (bits)*/ short bit_sequence; /* the huffman code sequence for this char */ double frequency; /* frequency of occurance, < 1.0 */ struct chardata *up, *left, *right; /* pointers to rest of tree */ }; typedef struct decode_table_element { /* template for decode table element (wow) */ unsigned char letter; /* which character to decode to */ char spare; /* force 16-bit word alignment */ short left; /* index of lower left element from tree */ short right; /* index of lower right element from tree */ }; int compare(const void*, const void*); /* prototype for compare() for qsort() */ struct chardata *huftable[256]; /* array that points to leaves in tree */ struct chardata *root=NULL; /* future root of tree */ struct decode_table_element *decode_table; /* pointer to decode table */ short array_max_index=0; /* max number of elements in array (to be */ /* determined in create_decode_table() */ long real_bit_total=0L; double avg_bits_per_symbol=0; /* averages num of bits needed to represent */ unsigned long total; /* total number of unencoded bytes */ long real_bits_total = 0L; FILE *infile; /* file ptr to original file (uncompressed) */ FILE *outfile; /* file ptr to output fiel (compressed) */ char *infile_name; /* ptr to name of input file */ char *outfile_name; /* ptr to name of output file */ int main(int argc, char **argv) { #ifdef VERBOSE void show_encode_info(void); /* prototype */ void show_encoded_file_size(void); /* prototype */ void show_decode_table(void); /* prototype */ #endif void dumptable(void); /* prototype */ short create_tree(void); /* prototype */ void gen_bit_sequences(struct chardata *node); /* prototype */ short create_decode_table(void); /* prototype */ short compress(void); /* prototype */ register short c; /* a character */ if (argc != 3) { /* check command line arguments */ puts("'huf file1 file2' encodes file1 into file2."); return 1; } puts("Huf by Shaun Case, 1991, Public Domain"); infile_name = argv[1]; outfile_name = argv[2]; puts("Analyzing data."); for (c=0; c < 256; c++) /* initialize decode table */ { if ((huftable[c] = (struct chardata *)malloc(sizeof (struct chardata)))== NULL) { printf("Unable to allocate space for %dth huftable node.",c); return 1; } huftable[c]->charnum = c; /* need to know who we are after qsort() gets done with us */ huftable[c]->total = 0L; huftable[c]->frequency = 0; huftable[c]->up = NULL; huftable[c]->left = NULL; huftable[c]->right = NULL; } if ((infile=fopen(infile_name, "rb")) == NULL) /* open the input file */ { printf("Unable to open %s.\n", infile_name); return 1; } while (!feof(infile)) /* get character distribution data */ { c = fgetc(infile); if (feof(infile)) continue; ++total; ++huftable[c]->total; /* increment total for char c */ } if (total == 0L) /* handle empty file */ { puts("Input file is empty, aborting."); return 0; } fclose(infile); /* sort according to frequency of occurance */ qsort((void *)huftable, (size_t)256, (size_t)sizeof(struct chardata *), compare); dumptable(); /* fill huftable with distribution freqs */ if (create_tree() != TRUE) /* make the huffman tree (bit sequences) */ return 1; #ifdef VERBOSE printf("\nFreq of root is %f.\n",root->frequency); /* this should be 1.0 if all went well */ puts("\nBit sequences:\n\n"); #endif avg_bits_per_symbol = 0; gen_bit_sequences(root); /* build bit sequences, stuff seqs & */ /* lengths into associated leaves */ #ifdef VERBOSE printf("\n\nActual bits per symbol = %f", avg_bits_per_symbol); printf("\nActual final data size w/o table should be %f\n", avg_bits_per_symbol * (double)total / ((double) 8)); show_encoded_file_size(); #endif puts("Building decode table..."); if (create_decode_table() != TRUE) /* create decode array from tree */ { puts("Unable to allocate space for decode table, exiting."); return 1; } printf("Decode table built, contains %d elements.\n",array_max_index + 1); #ifdef VERBOSE show_decode_table(); show_encode_info(); #endif if (compress() != 0) /* now that we have all necessary info, */ return 1; /* perform the compression. */ puts("Done."); return 0; } /***************************************************************************** * this code is going to be a little strange -- we have an array of items * * that are going to be the leaves of the tree, and we have to go upward * * linking them together to make the root. For algorithm, see my notebook, * * or look up huffman compression in a good algorithms book, or see * * huffman's paper -- "A Method for the Construction of Minimum-Redundancy * * Codes" David A. Huffman, Proceedings of the IRE 40, 1098-- 1101 * * * * after the tree is built, the root of the tree is assigned to the global * * variable root. * * * * -- Shaun Case * *****************************************************************************/ struct chardata *ptr1, *ptr2; /* 1 = smallest freq, 2 = 2nd smallest */ short create_tree() { void find_lowest_freqs(void); /* prototype */ short only_one_up_ptr_left(void); /* prototype */ double maxfreq = 0 ; /* max combined freqency in the tree */ struct chardata *new_node = NULL; /* ptr to new node to be created */ puts("Creating tree from frequencies..."); while (maxfreq < 0.99999 ) /* while we haven't made the root yet */ { find_lowest_freqs(); /* find the two lowest combined or */ /* unused single frequencies */ if ( /* create a new node */ (new_node = (struct chardata *)malloc(sizeof (struct chardata)) ) == NULL ) { puts("Insufficient memory, malloc() failed in create_tree()."); return FALSE; } /* initialize the new node */ new_node->up = NULL; new_node->left = ptr2; new_node->right = ptr1; new_node->charnum = -1; /* if you get a lot of -1s when traversing the tree, */ /* you aren't looking at the leaves. */ ptr1->up = new_node; ptr2->up = new_node; new_node->frequency = ptr1->frequency + ptr2->frequency; /* combine 2 old freqs */ maxfreq = new_node->frequency; /* clever optimization */ #ifdef VERBOSE printf("Newly created freq == %f\n", maxfreq); #endif } root = new_node; /* update global variable */ if (only_one_up_ptr_left()) /* verify tree is correct */ { /* algorirthm seems to work */ puts("Done creating tree."); /* fine, this is a saftey */ #ifdef verbose /* check. */ puts("Win: apparently only one remaining up-pointer."); #endif } else { puts("Lose: apparently more than one remaining up-pointer."); return FALSE; } return TRUE; /* everything is fine. */ } /************************************************************ * verify there is only one root after the tree is created * * by making sure only one node has an up pointer that has * * the value NULL. * * * * Just a double-check. * * * ************************************************************/ short only_one_up_ptr_left(void) { short bottom=255; register struct chardata *tptr; register struct chardata *first_null_up_ptr=NULL; tptr=huftable[bottom]; while (bottom != -1) { while (tptr->up != NULL) /* find root for this leaf */ tptr = tptr->up; if (first_null_up_ptr == NULL) /* assign uptr to root, 1st time */ first_null_up_ptr = tptr; else if (first_null_up_ptr != tptr) /* uh oh, found another root.. */ return FALSE; --bottom; } return TRUE; } /**************************************************************** * * * Find the two smallest combined or unused single frequencies * * in the partially-constructed tree. * * * ****************************************************************/ void find_lowest_freqs(void) { register struct chardata *tptr; /* the following are local for speed (I think double indirection is slow */ register struct chardata *local_minptr; /* ptr to smallest unused freq */ register struct chardata *local_second_minptr; /* ptr to 2nd smallest unused freq */ struct chardata dummy; short bottom = 255; dummy.frequency = 2.0; /* initialize to bigger than 1.000 */ local_minptr=&dummy; /* initialize to bigger than 1.000 */ while(bottom != -1) /* find smallest "unused" frequency of all */ { tptr = huftable[bottom]; if (tptr->total == 0L) /* skip unused byte values (not all files */ { /* contain all 256 chars) */ --bottom; continue; } while (tptr->up != NULL) /* find top of tree ending in current leaf */ tptr = tptr->up; if (tptr->frequency < local_minptr->frequency) /* if current < min */ local_minptr = tptr; /* then min = current */ --bottom; /* keep going... */ } /* now find second smallest "unused" frequency */ bottom = 255; /* start at the bottom again */ local_second_minptr=&dummy; /* initialize to bigger than 1.000 */ while(bottom != -1) { tptr = huftable[bottom]; if (tptr->total == 0L) /* skip unused byte values */ { --bottom; continue; } while (tptr->up != NULL) /* find top of tree ending in current leaf */ tptr = tptr->up; if ( (tptr->frequency < local_second_minptr->frequency) /* if current < min2 */ && (tptr != local_minptr) /* and curr <> min1 */ ) local_second_minptr = tptr; /* then min2 = current */ --bottom; /* keep going... */ } ptr1 = local_minptr; ptr2 = local_second_minptr; } /******************************************************* * * * Dump the contents of the huffman table. * * also fills huftable with distribution frequencies * * * *******************************************************/ void dumptable(void) { register short i; int bits_per_char; double check_total = 0; double percent = 0; double frac; #ifdef VERBOSE puts("Totals:\n"); #endif #ifdef VERBOSE printf("\n\ntotal chars = %ld.\n", total ); #endif if (total==0) /* otherwise get divide by zero error */ return; avg_bits_per_symbol=0; for (i=0; i< (256); i++) { huftable[i]->frequency = frac = (((double)huftable[i]->total))/(double)total; percent= (double)(100)*frac; if (frac >0) bits_per_char = (int)ceil( -log10(frac)/log10((double)2)); #ifdef VERBOSE printf("\n%3d = %6ld = %%%6.3f => %d bits",huftable[i]->charnum, huftable[i]->total, percent, bits_per_char ); #endif check_total += percent; avg_bits_per_symbol += ((double)bits_per_char) * frac; } #ifdef VERBOSE printf("\n\nTotal........ %%%6.3f\n", check_total); printf("Average bits per symbol = %6.3f\n",avg_bits_per_symbol); printf("Total compressed file length w/o decode table = %f\n", avg_bits_per_symbol * (double)total / ((double) 8) ); #endif } /********************* * called by qsort() * *********************/ int compare(const void *arg1, const void *arg2) { long result = ( ( *(struct chardata **)arg1 )->total ) - ( ( *(struct chardata **)arg2 )->total ); /* return codes reveresed from normal for descending order */ if (result > 0L) return -1; /* first > second */ if (result < 0L) return +1; /* first < second */ return 0; /* first = second */ } /*************************************************************************** * floating point, backwards trees, dynamic memory allocation, * * and now recursion -- this program has it all! * * * * func builds bit sequences at stuffs associated sequence and length into * * corresponding leaf node * * * * Pretty obvious code, if you know recursion. * * * ***************************************************************************/ short seq_len=0; void gen_bit_sequences(struct chardata *node) { unsigned short asctobin(char *bit_seq); static char bit_sequence[16]; /* where to build the huffman bit sequence */ double frac; if (node->left != NULL) { bit_sequence[seq_len++]='1'; gen_bit_sequences(node->left); seq_len--; } if (node->right != NULL) { bit_sequence[seq_len++]='0'; gen_bit_sequences(node->right); seq_len--; } if (node->right != NULL) /* we are at an interior node going back up */ return; bit_sequence[seq_len] = 0; node->seq_length = (long)seq_len; node->bit_sequence = asctobin(bit_sequence); #ifdef VERBOSE printf("%3d == %16s == %4x %3d\n", node->charnum, bit_sequence, node->bit_sequence, seq_len); #endif frac = (((double)huftable[node->charnum]->total))/(double)total; avg_bits_per_symbol += ((double)seq_len) * frac; real_bits_total += (long)seq_len; } /********************************************* * turn an ASCII representation of a binary * * number into an unsigned short. * *********************************************/ unsigned short asctobin(char *bit_seq) { register short i; register unsigned short binary_value=0; for (i = 0; (i < 16) && (bit_seq[i]); i++) { binary_value <<= 1; binary_value |= (bit_seq[i]=='1') ? 1 : 0; } return binary_value; } /************************************************************ * build a decode table (an array) from the big fat hairy * * tree structure we created earlier. the decode table is * * parsed just like the tree, only instead of pointers to * * other nodes, there are left and right array indecies. * * This was done because the destination system for my * * application had no dynamic memory allocation. Also * * I didn't want to try writing a tree to disk, ick. * ************************************************************/ short create_decode_table(void) { void assign_index(struct chardata *node); void fill_array(struct chardata *node); assign_index(root); /* assign index nums to each node in tree -- use member 'total' for */ /* this purpose, since it isn't needed anymore. */ /* allocate space for array dynamically, since we don't know */ /* how big it is going to be. (will be small if there are */ /* only a few different characters that are used repeatedly */ /* in the file to be encoded. */ decode_table = (struct decode_table_element *) calloc(array_max_index + 1, sizeof(struct decode_table_element)); if (decode_table == NULL) return FALSE; fill_array(root); /* fill up the table */ return TRUE; } /*************************************************************************** * assign all nodes index numbers. uses preorder traversal to ensure that * * the root is assigned 0, which will be the 'root' of the array -- * * where the searching starts when decoding. * * sets value of global, "array_max_index" * ***************************************************************************/ void assign_index(struct chardata *node) { if (node != NULL) { node->total = (long)array_max_index++; assign_index(node->left); assign_index(node->right); } } /************************************************* * fill up the decode table * * use preorder traversal (see above for reason) * *************************************************/ short temp_index = 0; void fill_array(struct chardata *node) { if (node != NULL) { if (node->left == NULL) /* we found a leaf */ { decode_table[temp_index].letter = node->charnum; decode_table[temp_index].left = 0; decode_table[temp_index].right = 0; } else /* we are at an interior node */ { decode_table[temp_index].letter = '@'; /* if you get a lot of '@' in your decoded */ /* file you are looking at interior nodes, */ /* not leaves. */ decode_table[temp_index].left = (short) node->left->total; decode_table[temp_index].right = (short) node->right->total; } temp_index++; fill_array(node->left); fill_array(node->right); } } /*********************** * perform compression * ***********************/ short compress(void) { short output_bits(short bit_seq, short seq_len); /* prototype */ short find_match(int c); /* prototype */ register unsigned long i; /* index variable */ register short array_index; /* index to info for char */ int c; /* a character */ char orig_name[MAXFILE+MAXEXT], /* original filename */ drive[MAXDRIVE], /* needed for fnsplit() */ dir[MAXDIR], /* needed for fnsplit() */ filename[MAXFILE], /* 8 chars, orig name */ ext[MAXEXT]; /* 3 chars, orig ext */ if ((infile=fopen(infile_name, "rb")) == NULL) { printf("Unable to open %s\n", infile_name); return 1; } if ((outfile=fopen(outfile_name, "wb")) == NULL) { printf("Unable to open %s\n", outfile_name); return 1; } fnsplit(infile_name, drive, dir, filename, ext); /* get filename */ strcpy(orig_name, filename); strcat(orig_name, ext); for (i=0; i<13; i++) /* write orig filename */ fputc(orig_name[i], outfile); /* to file one char at a time */ /* write # of array elems */ if ( fwrite((void*)&array_max_index, sizeof(short), 1, outfile) < 1 ) { printf("Unable to write to %s -- out of disk space?", outfile_name); fclose(outfile); return 1; } /* write original length */ if ( fwrite((void*)&total, sizeof(unsigned long), 1, outfile) < 1 ) { printf("Unable to write to %s -- out of disk space?", outfile_name); fclose(outfile); return 1; } /* write decode table */ if ( fwrite((void*)decode_table, sizeof(struct decode_table_element), array_max_index + 1, outfile) < array_max_index ) { printf("Unable to write to %s -- out of disk space?", outfile_name); fclose(outfile); return 1; } printf("Encoding %ld bytes.\n",total); for (i=0; i< total; i++) /* for all bytes */ { c = fgetc(infile); /* get a new byte */ if (c == EOF) { printf("\nReached unexpected end of input file at position %ld.\n",i); return 1; } array_index = find_match(c); /* find it in the decode table */ if ( /* write out the bit sequence */ output_bits(huftable[array_index]->bit_sequence, (short)huftable[array_index]->seq_length) != 0 ) { printf("Unable to write to %s -- out of disk space?", outfile_name); return 1; } } output_bits(255,7); /* flush last partial char from buffer */ printf("%ld bytes encoded.\n", i); fclose(outfile); return 0; } /*************************************** * return array index that corresponds * * to a particular character * ***************************************/ short find_match(int c) { register short i=0; while (c != huftable[i]->charnum) ++i; return i; } /************************************ * write out a bit sequence to disk * ************************************/ short output_bits(short bit_seq, short seq_len) { static unsigned char buffer = 0; static unsigned short bits_used = 0; short i; bit_seq <<= (16 - seq_len); /* bring 1st real bit to leftmost position */ for (i = 0; i < seq_len; i++) { buffer <<= 1; /* make space for next bit in buffer */ buffer |= ((bit_seq & 0x8000) != 0); /* set rightmost bit of buffer to lefmost bit in sequence */ bit_seq <<= 1; /* throw away used bit from sequence */ bits_used++; /* increment bit count */ if (bits_used == 8) /* if we have a whole byte, write it. */ { if (fputc((int)buffer, outfile)==EOF) { fclose(outfile); return 1; } bits_used = 0; /* start fresh for new byte */ buffer = 0; /* may not be necessary except on final bits/byte */ } } return 0; } #ifdef VERBOSE void show_encode_info(void) { register short i; for (i=0; i<256; i++) printf("%3d. charnum =%3d index == %ld seq_len =%d, bit_seq = %x\n", i, huftable[i]->charnum, huftable[i]->total, huftable[i]->seq_length, huftable[i]->bit_sequence); } /****** Datafile: All 16/32 bit quantities in Intel format 13 bytes : original filename (8.3 + "\0") 16 bits : number of array elements needed, N (N == 511 means 512 array elements -> 0..511) 32 bits : size of uncompressed original data in bytes N * 6 bytes : Array elements in order 0 .. N struct decode_table_element { char letter; 8 bits char spare; 8 bits short left; 16 bits short right; 16 bits } : compressed data, effectively a bit stream ******/ void show_encoded_file_size(void) { register unsigned short i; double bit_total=0; for (i = 0; i < 256; i++) bit_total += (double)huftable[i]->total *(double)huftable[i]->seq_length; printf("Best guess at encoded file size is %f bytes.\n", bit_total / (double) 8); } void show_decode_table(void) { register unsigned short i; for (i=0; i