/*	This file is part of the software similarity tester SIM.
	Written by Dick Grune, Vrije Universiteit, Amsterdam.
*/

#include	"buff.h"
#include	"text.h"

/* Pre-ANSI style declaration of the allocator called by make_hash(). */
extern char *calloc();

/* Option flags, indexed by option character; this file tests 'x', 'h' and 'H'. */
extern char options[];
/* Number of entries in text[] that make_hash() walks through. */
extern int ntexts;
/* The texts; make_hash() uses the tx_start and tx_limit fields of each entry. */
extern struct text *text;
/* Number of characters covered by a single hash code (see hash_code()). */
extern int min_run_size;

/* Forward declarations of file-local routines defined further down. */
static int hash_code();
static print_hash();

/* Number of distinct hash codes; hash_code() reduces its result modulo N_HASH. */
#define	N_HASH	10639			/* any suitable prime */

/*	hash_table[i] is the index of the next position after i whose
	min_run_size characters have the same hash code as those at i,
	or 0 if no such position was found.
	Allocated by make_hash() through calloc() and released by
	free_hash(); it is 0 when make_hash() did not provide a table.
*/
unsigned int *hash_table;

/*	To judge the quality of the hash code: tally_hash() counts in
	tally_right the hash matches whose characters really are equal,
	and in tally_wrong those whose characters differ.
*/
static tally_right = 0, tally_wrong = 0;
static tally_hash(), print_tally();

make_hash()	{
	/*	Builds hash_table, which chains together the positions in
		buff[] whose next min_run_size characters have the same
		hash code: hash_table[i] becomes the index of the next such
		position after i, and stays 0 if there is none.
		If no table is available (calloc() failed or option 'x' is
		set), hash_table is left 0, a warning is printed and nothing
		else is done.
		Option 'h' tallies and prints the quality of the hash code;
		option 'H' dumps the finished table.
	*/
	unsigned int last[N_HASH];
	/*	last[i] is the index of the latest char with hash_code i,
		or 0 if there is none.
	*/
	int n;
	
	/*	Option 'x' forces a run without hash table.  It is tested
		before allocating: allocating first and zeroing the pointer
		afterwards would leak the block just obtained.
	*/
	if (options['x'])
		hash_table = 0;
	else
		hash_table = (unsigned int *)
			calloc(text_length(), sizeof (unsigned int));
	if (!hash_table)	{
		printf(">>> Not enough memory for the hash table, ");
		printf("this is going to take time!\n\n");
		return;
	}
	
	/* no position has been seen yet for any hash code */
	for (n = 0; n < N_HASH; n++)
		last[n] = 0;
	
	for (n = 0; n < ntexts; n++)	{
		struct text *txt = &text[n];
		unsigned int j;
		
		/*	keeps tx_limit - min_run_size + 1 below from dropping
			under zero, which the unsigned comparison with j
			would misread as a huge bound
		*/
		if(txt->tx_limit < min_run_size)
			continue;
		/* visit every position that still has min_run_size chars */
		for (
			j = txt->tx_start;
			j < txt->tx_limit - min_run_size + 1;
			j++
		)	{
			int h = hash_code(&buff[j]);
			
			if (last[h])	{
				/* link the previous position with this code to j */
				hash_table[last[h]] = j;
				if (options['h'])
					tally_hash(last[h], j);
			}
			last[h] = j;
		}
	}
	if (options['h'])
		print_tally();
	if (options['H'])
		print_hash();
}

static int
hash_code(p)
	char *p;
{
	/*	Computes the hash code, in the range 0 .. N_HASH-1, of the
		min_run_size characters that start at p.  The caller must
		make sure that that many characters are available.
	*/
	int code = 0;
	int left = min_run_size;
	
	while (left-- > 0)	{
		/* use the character as a value in 0 .. 255 */
		int ch = *p++ & 0xff;
		
		code = (2 * code + ch) % N_HASH;
	}
	return code;
}

static
print_hash()
{
	/*	Dumps the hash table, one line per position from 1 up to
		text_length(): the position, the character in buff[] at
		that position and the hash_table entry for it.
		Will not be called if hash_table == 0.
	*/
	unsigned int i;
	
	for (i = 1; i < text_length(); i++)	{
		/* i is unsigned, so it is printed with %u rather than %d */
		printf("%u: %c: %u\n", i, buff[i], hash_table[i]);
	}
}

static
tally_hash(i0, i1)
	unsigned int i0, i1;
{
	/*	Checks whether the min_run_size characters in buff[] at i0
		and at i1, which have the same hash code, really are the
		same, and counts the outcome in tally_right or tally_wrong.
	*/
	int matched = 0;
	
	/* advance over the leading characters on which both runs agree */
	while (	matched < min_run_size
	&&	buff[i0 + matched] == buff[i1 + matched]
	)	{
		matched++;
	}
	
	if (matched < min_run_size)
		tally_wrong++;
	else
		tally_right++;
}

static
print_tally()
{
	/*	Prints both tallies and the percentage of hash matches that
		turned out to be real matches.
	*/
	int total = tally_right + tally_wrong;
	
	printf("Tally_right = %d, tally_wrong = %d, ",
		tally_right, tally_wrong);
	if (total == 0)	{
		/*	No two positions shared a hash code, so nothing was
			tallied; the percentage does not exist and computing
			it would divide by zero.
		*/
		printf("hash code efficiency = n/a\n");
	}
	else	{
		printf("hash code efficiency = %d%%\n",
			100 * tally_right / total);
	}
}

free_hash()	{
	/*	Releases the hash table, if there is one.  The pointer is
		reset afterwards, so that a second call does not free the
		block again and tests on hash_table do not see a stale
		pointer.
	*/
	if (hash_table)	{
		free((char *)hash_table);
		hash_table = 0;
	}
}
