/*****************************************************************************/
/* File: huffman.c                                                           */
/* Author: David Chatenay                                                    */
/* Last Modified: Fri Oct 18 1996                                            */
/*                                                                           */
/* Implementation of the Huffman core. Long and nasty, heavy local memory use*/
/*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common.h"
#include "crunch.h"
#include "buffer.h"
#include "header.h"


/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*                               MISCELLANEOUS                               */
/*****************************************************************************/
/* DEFINES */
/* Inputs shorter than this many bytes are rejected by ComputeStats */
#define MIN_FILE_LEN 256
/* Bits of FileHeader.flags (the low byte of flags carries a field width): */
/*   HEADER_CODE: the records are Huffman codes                            */
/*   HEADER_STAT: the records are (quantized) frequencies                  */
/*   HEADER_TOPO: a 32-byte bitmap of the present byte values follows      */
/*   HEADER_WLEN: every record is preceded by its own length               */
#define HEADER_CODE  0x8000
#define HEADER_STAT  0x4000
#define HEADER_TOPO  0x2000
#define HEADER_WLEN  0x1000


/* STRUCTURES DEFINITION */
/*** The cell (element of the tree). A cell is either a leaf, carrying one */
/*** byte value and its code, or an internal node carrying two children.   */
/*** The `leaf` flag tells which member of the `lon` union is meaningful.  */
typedef struct huffcell {
    dword n;    /* Frequency (quantized by ComputeStats, or read from the header) */
    dword on;   /* Frequency as it was before quantization (set in ComputeStats;   */
                /* ReadHuffHeader copies the header value into it)                 */
    bool  leaf; /* Is a leaf? Selects lon.leaf (TRUE) or lon.node (FALSE) */
    union {
	struct {
	    Code  code;       /* Huffman code assigned to the byte value */
	    byte  character;  /* The byte value itself */
	} leaf;  /* The leaf parameters */
	struct {
	    struct huffcell *left;   /* Child taken when the tested bit is clear (see Decode) */
	    struct huffcell *right;  /* Child taken when the tested bit is set (see Decode)   */
	} node;  /* The node parameters */
    } lon;
} HuffCell;
/*** Huffman vars (container of used vars). One instance lives on the stack */
/*** of HuffmanEncode / HuffmanDecode for the duration of one operation.    */
typedef struct {
    HuffCell  *root;       /* The root of the Huffman tree */
    HuffCell  chars[256];  /* The 256 characters (leaves), indexed by byte value */
    HuffCell  nodes[256];  /* The nodes (to avoid malloc), handed out by NewCell/NewNode */
    HuffCell  *index[257]; /* The sorted index; BuildTree writes at index[i+1], */
                           /* hence one slot more than there are characters     */
    dword file_length; /* The original file length (before compression) */
    word  used_chars;  /* Number of used characters in the file */
    word  used_nodes;  /* Number of currently used nodes */
    word  max_depth;   /* Max code length */
    bool  stat;        /* Header contains stats? (set by ReadHuffHeader) */
} HuffmanContext;
/*** File header (used for file i/o) */
typedef struct {
    word  flags;     /* HEADER_* mode bits; the low byte holds a field width in bits */
    byte topo[32];   /* 256-bit presence bitmap: byte i/8, tested with UMask[i%8]    */
} FileHeader;



/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*                           LOW-LEVEL FUNCTIONS                             */
/*****************************************************************************/
/*** Initialisation of the Huffman context.                                  */
/*** Every leaf gets a zero frequency, its own byte value and an empty code; */
/*** the index is set to the identity mapping and all counters are cleared.  */
/*** All fields are given a defined value, including `on` and `stat`, so     */
/*** that no later reader can observe an indeterminate value.                */
static void HuffmanInitContext(HuffmanContext *h)
{
    int i;

    DEBUG0("HuffmanInitContext:\n");
    for (i=0; i<256; i++) {
	HuffCell *cell = &(h->chars[i]);

	cell->n = 0;
	cell->on = 0;
	cell->leaf = TRUE;
	cell->lon.leaf.character = (byte)i;
	cell->lon.leaf.code.code_len  = 0;
	/* Clear the whole bit buffer, whatever its declared size is */
	(void)memset(cell->lon.leaf.code.code, 0,
		     sizeof(cell->lon.leaf.code.code));
	h->index[i] = cell;
    }

    h->root = NULL;
    h->index[256] = NULL;  /* Spare slot used by BuildTree */
    h->file_length = 0;
    h->used_chars = 0;
    h->used_nodes = 0;
    h->max_depth = 0;
    h->stat = false;
    DEBUG0("Done.\n\n");
}


/*****************************************************************************/
/*** Read the whole input once, count how often every byte value occurs, */
/*** then quantize the counts and count the byte values that are in use. */
/*** Returns 0 on success, -1 when the input is shorter than             */
/*** MIN_FILE_LEN (TORI may also leave the function early).              */
static int ComputeStats(HuffmanContext *h, Buffer *in)
{
    dword step, half_step, rest;
    int k;

    DEBUG0("ComputeStats:\n");

    /* One pass over the file: one counter per byte value */
    DEBUG0("  [Read]\n");
    TORI(Seek(in->fd, 0, SEEK_SET));
    ReadBuffer(in);
    while (in->buff_length > 0) {
	k = 0;
	while (k < in->buff_length) {
	    h->chars[in->buffer[k]].n += 1;
	    k++;
	}
	h->file_length += (dword)in->buff_length;
	ReadBuffer(in);
    }
    if (h->file_length < MIN_FILE_LEN) {
	ERROR("File too short to be compressed");
	return -1;
    }

    /* Quantization: the step is the smallest non-zero count; */
    /* the untouched count is kept in `on`                    */
    DEBUG0("  [Quantization]\n");
    step = 0;
    for (k=0; k<256; k++) {
	HuffCell *cell = &(h->chars[k]);

	cell->on = cell->n;
	if (cell->n == 0)
	    continue;
	if (step == 0 || cell->n < step)
	    step = cell->n;
    }
    if (step > 1) {
	DEBUG1("  Step: %ld\n", step);
	half_step = step >> 1;
	for (k=0; k<256; k++) {
	    /* Divide by the step, rounding up past the half step */
	    rest = h->chars[k].n % step;
	    h->chars[k].n /= step;
	    if (rest > half_step)
		h->chars[k].n++;
	}
    }

    /* Number of byte values that occur at least once */
    for (k=0; k<256; k++) {
	if (h->index[k]->n != 0)
	    h->used_chars++;
    }

    /* Display values */
    DEBUG1("  File length: %ld\n", h->file_length);
    DEBUG1("  Used chars: %d\n", h->used_chars);

    DEBUG0("Done.\n\n");
    return 0;
}


/*****************************************************************************/
    /* Subroutine for BuildTree: take the next free cell of h->nodes and  */
    /* turn it into an internal node of weight n with children l and r.   */
    /* Running out of cells is treated as a programming error: a message  */
    /* is reported and the process exits.                                 */
    static HuffCell *NewCell(HuffmanContext *h, dword n, HuffCell *l, HuffCell *r)
    {
	HuffCell *cell;

	if (h->used_nodes >= 256) {
	    ERROR("Bug in module huffman.c: No more free nodes!");
	    exit(-1);
	}

	cell = &(h->nodes[h->used_nodes]);
	h->used_nodes++;
	cell->leaf = FALSE;
	cell->n = n;
	cell->lon.node.left = l;
	cell->lon.node.right = r;
	return cell;
    }
    /* Subroutine for BuildTree: walk the subtree rooted at r, which sits */
    /* at depth d, and raise *md to the largest depth met on the way.     */
    static void FindDepth(word *md, HuffCell *r, word d)
    {
	HuffCell *child;

	if (*md < d)
	    *md = d;
	if (r->leaf)
	    return;

	child = r->lon.node.left;
	if (child != NULL)
	    FindDepth(md, child, d+1);

	child = r->lon.node.right;
	if (child != NULL)
	    FindDepth(md, child, d+1);
    }
/*** Build the Huffman tree with computed stats.                            */
/*** Input : h->chars[].n (frequencies) and h->used_chars.                  */
/*** Output: h->root, h->max_depth; h->index is reordered and h->nodes is   */
/***         consumed through NewCell.                                      */
/*** The index is sorted by decreasing frequency, then the two last active  */
/*** entries are merged repeatedly until a single entry is left.            */
static void BuildTree(HuffmanContext *h)
{
    HuffCell  **index, *cell;
    int i, j;
    dword n;

    DEBUG0("BuildTree:\n");

    /* Sort (insertion sort, decreasing frequency) */
    DEBUG0("  [Sort]\n");
    for (i=1; i<256; i++) {
        j = i;
        while (j != 0 && h->index[j]->n > h->index[j-1]->n) {
            /* Inversion of the cells */
            cell = h->index[j-1];
            h->index[j-1] = h->index[j];
            h->index[j] = cell;
            j--;
        }
    }

    /* Build the tree */
    DEBUG0("  [Build]\n");
    index = h->index;
    for (i=h->used_chars-1; i>0; i--) {
	/* Concatenate the last two cells; the new node is parked in  */
	/* the slot just after them (index[256] exists for that case) */
	n = index[i]->n + index[i-1]->n;
	index[i+1] = NewCell(h, n, index[i], index[i-1]);
	/* Re-sort the array (insertion): the node moves up, which pushes */
	/* its two children out of the active range [0, i-1]              */
	j = i + 1;
	while (j != 0 && index[j]->n > index[j-1]->n) {
	    /* Swap */
	    cell = index[j-1];
	    index[j-1] = index[j];
	    index[j] = cell;
	    j--;
	}
    }
    h->root = index[0];

    /* A single used symbol leaves a bare leaf as root, i.e. a code of   */
    /* length 0. A length of 0 means "absent" in the code-length header, */
    /* so hang the leaf under a node to give it a one-bit code instead.  */
    if (h->used_chars == 1 && h->root->leaf)
	h->root = NewCell(h, h->root->n, h->root, NULL);

    /* Time for fun: find max depth */
    DEBUG0("  [FindDepth]\n");
    FindDepth(&(h->max_depth), h->root, 0);
    DEBUG1("  Depth: %d\n", h->max_depth);

    DEBUG0("Done.\n\n");
}



/*****************************************************************************/
    /* Subroutine for ComputeCodes: c is the code of the path leading to  */
    /* p. A leaf stores it; a node extends it by one bit and recurses.    */
    /* c is passed by value, so every branch works on its own copy.       */
    static void RecursiveCode(HuffCell *p, Code c)
    {
	int depth, byte_pos, bit_pos;

	if (p->leaf) {
	    p->lon.leaf.code.code_len = c.code_len;
	    (void)memcpy(p->lon.leaf.code.code, c.code, 16);
	    return;
	}

	/* Position of the bit added at this level */
	depth = c.code_len;
	byte_pos = depth / 8;
	bit_pos = depth % 8;
	c.code_len++;

	if (p->lon.node.left != NULL) {
	    /* Left branch: the new bit is masked with XMask */
	    c.code[byte_pos] &= XMask[bit_pos];
	    RecursiveCode(p->lon.node.left, c);
	}
	if (p->lon.node.right != NULL) {
	    /* Right branch: the new bit is set through RMask */
	    c.code[byte_pos] |= RMask[bit_pos];
	    RecursiveCode(p->lon.node.right, c);
	}
    }
/*** Compute huffman codes: walk the tree from h->root and store in every */
/*** leaf the code of the path that leads to it.                          */
static void ComputeCodes(HuffmanContext *h)
{
    Code c;

    DEBUG0("ComputeCodes:\n");
    /* Start from an empty, fully cleared code: RecursiveCode only       */
    /* touches the bits of the path and copies the rest of the buffer    */
    /* into the leaves, so the rest must not be left indeterminate.      */
    c.code_len = 0;
    (void)memset(c.code, 0, sizeof(c.code));
    RecursiveCode(h->root, c);
    DEBUG0("Done.\n\n");
}


/*****************************************************************************/
    /* Subroutines for OutputHeader and Encode */
/*** Output the Huffman header, in its shortest form, to the buffer `out`.   */
/***                                                                         */
/*** Three layouts are sized and the shortest one is kept:                   */
/***   - stats w/o length: every frequency on stat_max_len bits              */
/***   - stats w/ length : every frequency preceded by its own bit length    */
/***   - codes w/ length : every Huffman code preceded by its own bit length */
/*** Each layout can be combined with a 32-byte presence bitmap ("topo"); it */
/*** is emitted when it costs less than writing a record for every byte      */
/*** value that does not occur.                                              */
/***                                                                         */
/*** Bytes written: flags (2 bytes, low byte first), the optional bitmap,    */
/*** then the records, appended with AddCode.                                */
/*** Returns 0 on success (TORI may leave the function early on failure).    */
static int OutputHeader(HuffmanContext *h, Buffer *out)
{
    bool stat=false, topo=false, len=false;
    dword head_s, head_s_l, head_c_l, max;
    dword absent_cost;
    word stat_max_len, code_max_len, n;
    word stat_len_len, code_len_len;
    FileHeader fh;
    int i, j, l;
    Code head;

    DEBUG0("OutputHeader:\n");

    /* Init data */
    (void)memset(fh.topo, 0, sizeof(fh.topo));
    fh.flags = 0x0000;

    /* Loop to calculate all cases of header,  */
    /* and choose the shortest one...          */
    /*** First, compute lengths */
    code_max_len = h->max_depth;
    code_len_len = CodeLength(code_max_len);
    DEBUG2("  code_len=%d on %d bits\n", code_max_len, code_len_len);
    max = 0;
    for (i=0; i<256; i++)
	if (h->chars[i].n > max)
	    max = h->chars[i].n;
    stat_max_len = CodeLength(max);
    stat_len_len = CodeLength(stat_max_len);
    DEBUG2("  stat_len=%d on %d bits\n", stat_max_len, stat_len_len);
    /*** Next, compute header lengths */
    head_s = h->used_chars * stat_max_len;
    head_s_l = head_c_l = 0;
    for (i=0; i<256; i++)
	if (h->chars[i].n) {
	    head_c_l += code_len_len + h->chars[i].lon.leaf.code.code_len;
	    head_s_l += stat_len_len + CodeLength(h->chars[i].n);
	}
    /*** Display values */
    DEBUG0("  Header possible lengths:\n");
    DEBUG1("    -Stats w/o length: %ld bits\n", head_s);
    DEBUG1("    -Stats w/ length : %ld bits\n", head_s_l);
    DEBUG1("    -Codes w/ length : %ld bits\n", head_c_l);

    /* Serious things: output the header */
    ResetBuffer(out);
    out->byte_count = 2;  /* The two flag bytes are stored directly below */

    /* Pick the layout; absent_cost is what the records of the unused */
    /* byte values would cost if no bitmap were written               */
    absent_cost = (dword)(256 - h->used_chars);
    if (head_s < head_s_l && head_s < head_c_l) {
	/*** Output stats w/o length */
	stat = true; len = false;
	absent_cost *= stat_max_len;
    } else if (head_s_l < head_s && head_s_l < head_c_l) {
	/*** Output stats w/ length */
	stat = true; len = true;
	absent_cost *= stat_len_len;
    } else {
	/*** Output codes w/ length */
	stat = false; len = true;
	absent_cost *= code_len_len;
    }
    /* The bitmap (32*8 bits) is worth it only when it is the cheaper option */
    if ((dword)(32*8) < absent_cost)
	topo = true;
    else
	topo = false;

    DEBUG0("  Header: ");
    if (stat) DEBUG0("Stat "); else DEBUG0("Code ");
    if (len) DEBUG0("Length ");
    if (topo) DEBUG0("Topo");
    DEBUG0("\n");

    /* Flags: mode bits, plus the width of the fixed-size field in the low bits */
    if (stat)  fh.flags |= HEADER_STAT;  /* w/ stats  */
      else     fh.flags |= HEADER_CODE;  /* w/ codes  */
    if (len)   fh.flags |= HEADER_WLEN;  /* w/ length */
    if (topo)  fh.flags |= HEADER_TOPO;  /* w/ topo   */
    if (stat) {
	if (len)  fh.flags |= stat_len_len;
	  else    fh.flags |= stat_max_len;
    } else
	fh.flags |= code_len_len;
    out->buffer[0] = (fh.flags) & 0xFF;
    out->buffer[1] = (fh.flags >> 8) & 0xFF;

    /* Presence bitmap: built in fh.topo (cleared above) and then copied, */
    /* so the result does not depend on the previous buffer content       */
    if (topo) {
	for (i=0; i<256; i++)
	    if (h->chars[i].n != 0)
		fh.topo[i / 8] |= UMask[i % 8];
	(void)memcpy(&(out->buffer[out->byte_count]), fh.topo, sizeof(fh.topo));
	out->byte_count += 32;
    }

    /* Records: with a bitmap only the used byte values, else all 256 */
    if (stat) {
	/* Number of bytes needed to hold stat_max_len bits */
	n = (stat_max_len / 8) + ((stat_max_len % 8) ? 1 : 0);
	for (i=0; i<256; i++) {
	    if (topo && h->chars[i].n == 0)
		continue;
	    if (len) {
		/* Length prefix first, then the value on that many bits */
		head.code_len = stat_len_len;
		l = head.code[0] = CodeLength(h->chars[i].n);
		TORI(AddCode(&head, out));
		head.code_len = (word)l;
	    } else
		head.code_len = stat_max_len;
	    /* The frequency, low byte first (all zero for an unused value) */
	    max = h->chars[i].n;
	    for (j=0; j<(int)n; j++) {
		head.code[j] = (max & 0xFF);
		max = max >> 8;
	    }
	    TORI(AddCode(&head, out));
	}
    } else {
	for (i=0; i<256; i++) {
	    if (topo && h->chars[i].n == 0)
		continue;
	    /* Code length, then the code itself */
	    head.code_len = code_len_len;
	    head.code[0] = (byte)h->chars[i].lon.leaf.code.code_len;
	    TORI(AddCode(&head, out));
	    TORI(AddCode(&(h->chars[i].lon.leaf.code), out));
	}
    }
    DEBUG2("  [%d, %d]\n", out->byte_count, out->bit_count);
    DEBUG0("Done.\n\n");
    return 0;
}


/*****************************************************************************/
/*** Encode the input read through `in` into `out`: the input is rewound, */
/*** then the code of every byte is appended to `out`, which is flushed   */
/*** at the end. Returns 0 (TORI may leave the function early).           */
static int Encode(HuffmanContext *h, Buffer *in, Buffer *out)
{
    Code *code;
    int pos;

    DEBUG0("Encode:\n");
    TORI(Seek(in->fd, 0, SEEK_SET));
    /* Restart the input byte count for this second pass */
    in->total = 0;
    ReadBuffer(in);
    while (in->buff_length > 0) {
	pos = 0;
	while (pos < in->buff_length) {
	    code = &(h->chars[in->buffer[pos]].lon.leaf.code);
	    TORI(AddCode(code, out));
	    pos++;
	}
	ReadBuffer(in);
    }
    FlushBuffer(out);

    DEBUG0("Done.\n\n");
    return 0;
}


/*****************************************************************************/
/*****************************************************************************/
/*** Read the Huffman header from the file behind `in`.                      */
/***                                                                         */
/*** Reads the two flag bytes (low byte first), the optional 32-byte         */
/*** presence bitmap, then one record per byte value (only the values        */
/*** present in the bitmap when there is one). Depending on the flags a      */
/*** record is a frequency (stored in chars[i].n / .on and counted in        */
/*** h->used_chars) or a Huffman code (stored in chars[i].lon.leaf.code).    */
/*** h->stat tells the caller which kind was read.                           */
/***                                                                         */
/*** Returns 0 on success, -1 when the flags are not recognised or a field   */
/*** width read from the file cannot be valid (TORI may also leave early).   */
static int ReadHuffHeader(HuffmanContext *h, Buffer *in)
{
    bool stat=false, len=false, topo=false;
    int i, j, m=0, n;
    byte buffer[64];
    FileHeader fh;
    Code head;
    word code_len;
    word stat_bits = (word)(8 * sizeof(h->chars[0].n)); /* Widest frequency */

    DEBUG0("ReadHuffHeader:\n");
    /* Read header and understand it */
    TORI(ReadAbs(in->fd, buffer, 2));
    DEBUG0("  Header: ");
    fh.flags = buffer[0] + (buffer[1] << 8);
    if (fh.flags & HEADER_STAT) {
	DEBUG0("Stat ");
	stat = h->stat = true;
    } else
        if (fh.flags & HEADER_CODE) {
	    DEBUG0("Code ");
	    h->stat = false;
	} else {
	    ERROR("Not encoded with this lib!");
	    return -1;
	}
    if (fh.flags & HEADER_WLEN) {
        DEBUG0("Length ");
	len = true;
    }
    if (fh.flags & HEADER_TOPO) {
        DEBUG0("Topo ");
	topo = true;
    }
    code_len = (fh.flags & 0xFF);
    DEBUG1("  Code length coded on %d bits\n", code_len);

    /* code_len is used below as a bit count for ReadCode. It is either  */
    /* the width of a raw frequency (stats w/o length) or the width of a */
    /* length field that is read back from head.code[0], i.e. one byte.  */
    if (code_len > ((stat && !len) ? stat_bits : 8)) {
	ERROR("Corrupted Huffman header");
	return -1;
    }

    /* Read Topo */
    if (topo)
        TORI(ReadAbs(in->fd, fh.topo, 32));

    /* Read records */
    ResetBuffer(in);
    ReadBuffer(in);
    if (stat && !len)
	m = (code_len / 8) + ((code_len % 8) ? 1 : 0);
    for (i=0; i<256; i++) {
	if (!topo || (fh.topo[i/8] & UMask[i%8])) {
	    if (stat) {
		head.code_len = code_len;
		if (len) {
		    /* Length prefix, then the value on that many bits */
		    TORI(ReadCode(&head, in));
		    head.code_len = head.code[0];
		    if (head.code_len > stat_bits) {
			ERROR("Corrupted Huffman header");
			return -1;
		    }
		    n = (head.code_len / 8) + ((head.code_len % 8) ? 1 : 0);
		} else
		    n = m;
		if (head.code_len != 0) {
		    TORI(ReadCode(&head, in));
		    /* Rebuild the value, stored low byte first */
		    h->chars[i].n = 0;
		    for (j=n-1; j>=0; j--) {
			h->chars[i].n <<= 8;
			h->chars[i].n |= head.code[j];
		    }
		    h->chars[i].on = h->chars[i].n;
		    /* Without bitmap nor length, unused byte values are */
		    /* stored as a zero frequency: they must not count,  */
		    /* or BuildTree would not rebuild the encoder's tree */
		    if (h->chars[i].n != 0)
			h->used_chars++;
		}
	    } else {
		head.code_len = code_len;
		TORI(ReadCode(&head, in));
		/* The code must fit in the bit buffer of a Code */
		if (head.code[0] > 8 * sizeof(head.code)) {
		    ERROR("Corrupted Huffman header");
		    return -1;
		}
		h->chars[i].lon.leaf.code.code_len = head.code[0];
		if (head.code[0] != 0)
		    TORI(ReadCode(&(h->chars[i].lon.leaf.code), in));
	    }
	} else
	    h->chars[i].n = 0;
    }
    DEBUG2("  [%d, %d]\n", in->byte_count, in->bit_count);

    DEBUG0("Done.\n\n");
    return 0;
}


/*****************************************************************************/
    /* Subroutine for RebuildTree: take the next free cell of h->nodes   */
    /* and turn it into an internal node without children. Running out   */
    /* of cells is treated as a programming error: a message is reported */
    /* and the process exits.                                            */
    static HuffCell *NewNode(HuffmanContext *h)
    {
	HuffCell *cell;

	if (h->used_nodes >= 256) {
	    ERROR("Bug in module huffman.c: No more free nodes!");
	    exit(-1);
	}

	cell = &(h->nodes[h->used_nodes]);
	h->used_nodes++;
	cell->leaf = FALSE;
	cell->lon.node.right = NULL;
	cell->lon.node.left = NULL;
	return cell;
    }
/*** Rebuild the tree from what ReadHuffHeader stored in the context.      */
/*** With stats in the header the tree is simply built again by BuildTree. */
/*** With codes in the header, every code is followed bit by bit from a    */
/*** fresh root, creating the missing nodes, and the leaf is attached at   */
/*** the position given by its last bit.                                   */
/*** NOTE(review): code bits are tested with UMask here, whereas           */
/*** RecursiveCode sets them through RMask; confirm that both tables       */
/*** address the same bit for a given position.                            */
static void RebuildTree(HuffmanContext *h)
{
    HuffCell *node, **branch;
    Code *code;
    dword bit, last;
    int i;

    DEBUG0("RebuildTree:\n");
    if (h->stat) {
	/* Nearly nothing to do, except... call BuildTree! */
	BuildTree(h);
	DEBUG0("Done.\n\n");
	return;
    }

    /* More complicated: the intermediate nodes must be recreated */
    h->root = NewNode(h);
    for (i=0; i<256; i++) {
	code = &(h->chars[i].lon.leaf.code);
	if (code->code_len == 0)
	    continue;  /* No code: byte value not used */

	/* Follow all the bits but the last one, creating nodes on demand */
	node = h->root;
	last = code->code_len-1;
	for (bit=0; bit<last; bit++) {
	    if (code->code[bit/8] & UMask[bit%8])
		branch = &(node->lon.node.right);
	    else
		branch = &(node->lon.node.left);
	    if (*branch == NULL)
		*branch = NewNode(h);
	    node = *branch;
	}

	/* The last bit tells on which side the leaf hangs */
	if (code->code[last/8] & UMask[last%8])
	    node->lon.node.right = &(h->chars[i]);
	else
	    node->lon.node.left = &(h->chars[i]);
    }
    DEBUG0("Done.\n\n");
}


/*****************************************************************************/
/*** Recompute codes (if necessary): the codes are derived from the tree */
/*** only when the header carried stats; when it carried the codes       */
/*** themselves there is nothing to do.                                  */
static void RecomputeCodes(HuffmanContext *h)
{
    DEBUG0("RecomputeCodes:\n");
    if (!h->stat) {
	/* Codes came with the header */
	DEBUG0("Done.\n\n");
	return;
    }
    ComputeCodes(h);
    DEBUG0("Done.\n\n");
}


/*****************************************************************************/
/*** Decode file: produce file_length bytes into `out` from the bit stream */
/*** held by `in`, then flush `out`.                                       */
/*** For every output byte the tree is walked from h->root, one input bit  */
/*** per level (bit set: right child, bit clear: left child), until a leaf */
/*** is reached.                                                           */
/*** Returns 0 on success, -1 when the bit stream selects a branch that    */
/*** does not exist in the tree (corrupted data).                          */
static int Decode(HuffmanContext *h, Buffer *in, Buffer *out, dword file_length)
{
    HuffCell *current;
    dword decoded=0;
    byte *buffer;

    DEBUG0("Decode:\n");
    /* This is really tedious: we must explore     */
    /* the tree until we reach a leaf, reading the */
    /* input file bit by bit (slow... really slow) */
    buffer = in->buffer;
    while (decoded < file_length) {
	current = h->root;
	while (current != NULL && !current->leaf) {
	    if (buffer[in->byte_count] & RMask[in->bit_count])
	        current = current->lon.node.right;
	    else
	        current = current->lon.node.left;
	    in->bit_count++;
	    if (in->bit_count == 8) {
	        IncrCountI(in);
		in->bit_count = 0;
	    }
	}
	/* A tree rebuilt from header codes may have missing children: */
	/* stop cleanly instead of dereferencing a NULL cell           */
	if (current == NULL) {
	    ERROR("Corrupted data: invalid Huffman code");
	    return -1;
	}
	Put(out, current->lon.leaf.character);
	decoded++;
    }
    FlushBuffer(out);

    DEBUG0("Done.\n\n");
    return 0;
}




/*****************************************************************************/
/*****************************************************************************/
/*****************************************************************************/
/*                          HIGH LEVEL FUNCTIONS                             */
/*****************************************************************************/
/* HUFFMAN ENCODING */
/* Compress the file behind descriptor `in` to descriptor `out`.          */
/* Steps: write the crunch header, gather the statistics, build the tree  */
/* and the codes, write the Huffman header and the encoded data, write    */
/* the compressed length, and store the compression ratio in *cr.         */
/* Returns the result of Encode, or -1 when the statistics or the Huffman */
/* header could not be produced.                                          */
/* NOTE(review): if TORI returns from the function, a failure of          */
/* WriteLength leaves both buffers allocated; confirm against the macro.  */
int HuffmanEncode(int in, int out, float *cr)
{
    Buffer *src, *dst;
    CrunchHeader head;
    HuffmanContext ctx;
    float original, packed;
    int r;

    /* Header write */
    TORI(ComputeHeader(in, &head, METHOD_HUFFMAN));
    TORI(WriteHeader(out, &head));

    /* Allocate buffers */
    src = NewBuffer(in);
    dst = NewBuffer(out);

    /* Encoding sequence */
    HuffmanInitContext(&ctx);
    if (ComputeStats(&ctx, src) == -1) {
	KillBuffer(src);
	KillBuffer(dst);
	return -1;
    }
    BuildTree(&ctx);
    ComputeCodes(&ctx);
    r = -1;
    if (OutputHeader(&ctx, dst) == 0)
	r = Encode(&ctx, src, dst);

    /* Write compressed length */
    TORI(WriteLength(&head, src, dst));

    /* Compute compression ratio: share of the input that was saved */
    original = (float)src->total;
    packed = (float)dst->total;
    *cr = (original - packed) / original;

    /* Free buffers */
    KillBuffer(src);
    KillBuffer(dst);

    return r;
}


/* HUFFMAN DECODING */
/* Decompress the file behind descriptor `in` to descriptor `out`.       */
/* Steps: read and check the crunch header, read the Huffman header,     */
/* rebuild the tree (and the codes when needed), then decode             */
/* head.OriginalLength bytes.                                            */
/* Returns the result of Decode, or -1 when the file was not compressed  */
/* with the Huffman method or its Huffman header could not be read.      */
int HuffmanDecode(int in, int out)
{
    Buffer *src, *dst;
    CrunchHeader head;
    HuffmanContext ctx;
    int r;

    /* Read header */
    TORI(ReadHeader(in, &head));
    if (CheckFileHeader(in, &head) != METHOD_HUFFMAN)
        return -1;
    TORI(SeekRealPosition(in, &head, 0));

    /* Allocate buffers */
    src = NewBuffer(in);
    dst = NewBuffer(out);

    /* Decoding sequence; skipped when the header is rejected */
    HuffmanInitContext(&ctx);
    r = ReadHuffHeader(&ctx, src);
    if (r != -1) {
	RebuildTree(&ctx);
	RecomputeCodes(&ctx);
	r = Decode(&ctx, src, dst, head.OriginalLength);
    }

    /* Free buffers */
    KillBuffer(src);
    KillBuffer(dst);

    return r;
}
