/*
 * $Header: /u1/src/rfmail/RCS/hashgen.c,v 0.5 1992/05/18 04:27:24 pgd Exp pgd $
 *
 * $Log: hashgen.c,v $
 * Revision 0.5  1992/05/18  04:27:24  pgd
 * New distribution
 *
 * Revision 0.4.1.6  1992/03/15  07:58:52  pgd
 * Untested version
 *
 * Revision 0.4  1991/05/08  04:23:43  pgd
 * Initial Beta-release
 *
 */

/*
  Hash function generator

  Takes a set of strings and generates a hash function which produces a
  unique hash value for each of them. Also optionally generates #defines for
  each entry, a table of strings indexed by hash value, and optionally a hash
  function which checks the hashed string against the table.

  Files created are .c file for hash function and the table declaration and .h
  file for #defines and extern declarations.

  Copyright 1990 Heikki Suonsivu
  All rights reserved.

  You have right to use this piece of code for any purpose in condition you
  offer me a beef & beer when we see some day.

  This uses a stupid iterative method to find a table without conflicts. It
  is quick enough anyway.
  */
  
#include <stdio.h>
#include <ctype.h>
#include "conf.h"
#include "config.h"

#ifdef linux
#include <malloc.h>
#endif

#ifdef HAVE_STRING_H
#include <string.h>
#else
#include <strings.h>
#endif

#ifdef HAVE_SYSEXITS
#include <sysexits.h>
#endif

#include <errno.h>

extern int errno;
extern char *sys_errlist[];

/* strsave(s): heap-allocated copy of the string s.  The allocation is one
   byte larger than strictly required, and the malloc() result is passed to
   strcpy() without being checked for NULL. */
#undef strsave
#define strsave(s) (strcpy(malloc(strlen(s) + 2), s))

#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif

/* Values of the `strategy' variable in main().  Zero and negative values
   name the "unlimited" schemes that hash every character of the string;
   a positive value N below MAX_LIMITED_STRATEGY means "sum only the first
   N characters".  STRATEGY_ERROR is the initial value before any scheme
   has been tried. */
#define STRATEGY_ERROR -9999
#define STRATEGY_UNLIMITED 0
#define STRATEGY_UNLIMITED_XOR -1
#define STRATEGY_UNLIMITED_LOWBITS -2
#define MAX_LIMITED_STRATEGY 40
/* NOTE(review): MAX_LIMITED_SHIFTS is not referenced in the visible part of
   this file; the low-bits search loop uses a literal 8 instead. */
#define MAX_LIMITED_SHIFTS 8
/* Upper bound on both the number of input strings and the table size. */
#define MAX_HASH_SIZE 65536

#ifndef __STDC__
extern char *malloc();
#endif

/* s[]: the input strings, in the order they were read.
   hashed[]: the candidate hash table; hashed[h] is the string that hashes
   to slot h, or NULL when the slot is unused. */
char *s[MAX_HASH_SIZE];
char *hashed[MAX_HASH_SIZE];

/* mask[n] keeps the n lowest bits of a character; indexed by the number of
   bits shifted in the low-bits strategy. */
unsigned int mask[8] = { 0, 1, 3, 7, 15, 31, 63, 127 };

/*
 * Write one initializer line of the generated string table to `output'.
 *
 *   string          the table entry, or NULL for an unused slot
 *   mismatch_nulls  when non-zero an unused slot is written as NULL,
 *                   otherwise as the empty string ""
 *   last            when non-zero no trailing comma is written
 *
 * Double quotes and backslashes inside `string' are escaped so that the
 * emitted text is always a valid C string literal denoting the same
 * characters as the input.
 */
static void print_string_table_entry(output, string, mismatch_nulls, last)
     FILE *output;
     char *string;
     int mismatch_nulls, last;
{
  register char *p;

  if (string)
    {
      fputs("  \"", output);
      for (p = string; *p; p++)
	{
	  /* A raw quote would end the literal early and a raw backslash
	     would start an escape sequence; prefix both with a backslash. */
	  if (*p == '"' || *p == '\\')
	    putc('\\', output);
	  putc(*p, output);
	}
      putc('"', output);
    }
  else
    fputs(mismatch_nulls ? "  NULL" : "  \"\"", output);

  if (!last) putc(',', output);
  putc('\n', output);
}

/*
 * Print the command line summary to `fp'.
 *
 * The text is kept in a NULL-terminated table and written with fputs(), so
 * the "%s" in the last two lines is printed literally.  The continuation
 * line of the -d description directly follows -d (the -z line used to be
 * printed in the middle of it), and it names standard output as the place
 * the defines go when no output file is given, which is what main() does.
 */
void
printusage(fp)
     FILE *fp;
{
  static char *usage_lines[] = {
    "hashgen: generate hash function and hash table for input strings\n",
    "-v  increase verbosity\n",
    "-i <name of the input file> default stdin\n",
    "-o <name of the output file> default stdout. '.c' will be added\n",
    "-s <hash name> hash function and table name, default 'hash'\n",
    "-d  generate defines for input strings, uppercased and values according\n",
    "    to hash table. File name will be output file name with '.h' added, or\n",
    "    if one wasn't specified, standard output\n",
    "-z <string> head defines with <string>\n",
    "-t  generate char *table for hashed strings\n",
    "-n  string table no-match-entries are NULL, not empty strings.\n",
    "-c  Generate hash function to look up hash table to\n",
    "    check that string really matches. Will force -t.\n",
    "    generated hash function will return -1 for error.\n",
    "-h  you are looking at the output of this option\n",
    "hash function will be named to %s_function, hash table to\n",
    "%s_table.\n",
    NULL
  };
  register char **line;

  for (line = usage_lines; *line; line++)
    fputs(*line, fp);
}

/*
 * hashgen entry point.
 *
 * Parses the options (see printusage()), reads one key per line from the
 * input, searches for a hashing scheme and a power-of-two table size for
 * which no two keys collide, and then writes the generated C code: the hash
 * function, on request the string table, and on request a set of #defines
 * plus extern declarations.
 *
 * Exit status: EX_OK on success; EX_USAGE for bad options or files that
 * cannot be opened; EX_DATAERR for unusable input (no input, an empty line,
 * too many strings, or no conflict-free table below MAX_HASH_SIZE);
 * EX_SOFTWARE / EX_UNAVAILABLE for internal and resource failures.
 */
int
main(argc, argv)
     int argc;
     char *argv[];
{
  extern char *optarg;
  char buf[BUFSIZ], *hashname, *inputfile, outputfile[128], *define_head;
  register char *p;
  register int strings, hashsize, count, c, verbose, defines, strategy;
  register int bitmask, lowbits, mismatch_nulls, docheck;
  register FILE *input, *output;
  int hashtable;

  define_head = "";
  inputfile = NULL;
  input = stdin;
  *outputfile = 0;
  output = stdout;
  hashname = "hash";
  mismatch_nulls = FALSE;
  verbose = 0;
  defines = FALSE;
  hashtable = FALSE;
  docheck = FALSE;
  lowbits = 0;

  /* 'n' has to be listed here, otherwise getopt rejects the documented
     -n option and mismatch_nulls can never be switched on. */
  while ((c = getopt(argc, argv, "vi:o:s:dhtnz:c")) != EOF)
    {
      switch (c)
        {
         case 'v':
          verbose++;
          break;

         case 'i':
          inputfile = optarg;
          if ((input = fopen(inputfile, "r")) == NULL)
            {
              fprintf(stderr,
                      "panic: Could not open input file %s, errno %d:\n%s\n",
                      inputfile, errno, sys_errlist[errno]);
              exit(EX_USAGE);
            }
          break;

         case 'o':
          /* Room is needed for an appended ".c" (or, later, for ".h"
             replacing a bare trailing dot) plus the terminating NUL. */
          if (strlen(optarg) + 3 > sizeof(outputfile))
            {
              fprintf(stderr, "panic: output file name too long: %s\n",
                      optarg);
              exit(EX_USAGE);
            }
          (void) strcpy(outputfile, optarg);

          /* Only a dot in the last path component counts as a suffix;
             "./out" or "dir.d/out" still need ".c" appended. */
          p = strrchr(outputfile, '/');
          p = p ? p + 1 : outputfile;
          if (!strchr(p, '.')) (void) strcat(outputfile, ".c");

          if ((output = fopen(outputfile, "w+")) == NULL)
            {
              fprintf(stderr,
                      "panic: Could not open output file %s, errno %d:\n%s\n",
                      outputfile, errno, sys_errlist[errno]);
              exit(EX_USAGE);
            }
          break;

         case 's':
          hashname = optarg;
          break;

         case 'd':
          defines = !defines;
          break;

         case 't':
          hashtable = !hashtable;
          break;

         case 'c':
          /* The checking function compares against the table, so the
             table has to be generated as well. */
          docheck = TRUE;
          hashtable = TRUE;
          break;

         case 'n':
          mismatch_nulls = !mismatch_nulls;
          break;

         case 'z':
          define_head = optarg;
          break;

         case 'h':
          printusage(stdout);
          exit(EX_OK);
          /*NOTREACHED*/

         default:
          printusage(stderr);
          exit(EX_USAGE);
          /*NOTREACHED*/
        }
    }

  /* Read input strings, one per line */
  strings = 0;
  while (fgets(buf, BUFSIZ, input))
    {
      if ((p = strchr(buf, '\n')) != NULL) *p = 0;

      if (strings >= MAX_HASH_SIZE)
        {
          fprintf(stderr, "panic: too many strings\n");
          exit(EX_DATAERR);
        }

      if (!*buf)
        {
          fprintf(stderr, "panic: Cannot handle empty strings\n");
          exit(EX_DATAERR);
        }

      /* Copy the line to the heap; fail cleanly instead of writing
         through a NULL pointer when memory runs out. */
      p = malloc(strlen(buf) + 1);
      if (!p)
        {
          fprintf(stderr, "panic: out of memory\n");
          exit(EX_UNAVAILABLE);
        }
      s[strings++] = strcpy(p, buf);
    }

  if (!strings)
    {
      fprintf(stderr, "panic: no input?\n");
      exit(EX_DATAERR);
    }

  /* For ideal input, hashsize is same as stringsize. However, we want to use
     base-2 hash tables for efficiency: start from the smallest power of two
     that can hold all strings. */

  for (hashsize = 1; hashsize < strings; hashsize += hashsize)
    ;

  /* Loop increasing hash size until we find a table size and a hashing
     scheme for which no two strings collide. */

  strategy = STRATEGY_ERROR;
  while (hashsize < MAX_HASH_SIZE)
    {
      /* The hash is accumulated as an unsigned value: characters with the
         high bit set (on signed-char machines) and overflow can then never
         produce a negative table index, and hash % hashsize is exactly the
         (hash & (hashsize - 1)) that the generated function computes. */
      register unsigned int hash, modulus;
      register int failed;

      modulus = (unsigned int) hashsize;

      if (verbose > 2)
        fprintf(stderr, "%d strings, hashsize %d, waste %d%%\n",
                strings, hashsize, (hashsize - strings) * 100 / hashsize);

      /* First try simple hash, modulus of the sum of the characters */

      strategy = STRATEGY_UNLIMITED;
      failed = FALSE;
      for (count = 0; count < hashsize; count++) hashed[count] = NULL;
      for (count = 0; count < strings; count++)
        {
          hash = 0;
          for (p = s[count]; *p; p++)
            hash += (unsigned int) *p;
          hash = hash % modulus;

          if (hashed[hash])
            {
              if (verbose > 3)
                fprintf(stderr, "unlimited conflict '%s' '%s', hash = %u\n",
                        s[count], hashed[hash], hash);
              failed = TRUE;
              break;
            }
          hashed[hash] = s[count];
        }

      if (!failed)
        break;

      /* Then try mixing with xor: hash += hash ^ c.
         Works for small tables only. */

      strategy = STRATEGY_UNLIMITED_XOR;
      failed = FALSE;
      for (count = 0; count < hashsize; count++) hashed[count] = NULL;
      for (count = 0; count < strings; count++)
        {
          hash = 0;
          for (p = s[count]; *p; p++)
            hash += hash ^ (unsigned int) *p;
          hash = hash % modulus;

          if (hashed[hash])
            {
              if (verbose > 3)
                fprintf(stderr, "unlimited xor conflict '%s' '%s', %s %u\n",
                        s[count], hashed[hash], "hash =", hash);
              failed = TRUE;
              break;
            }
          hashed[hash] = s[count];
        }

      if (!failed)
        break;

      /* Then try shifting in the lowest 1..7 bits of each character */

      strategy = STRATEGY_UNLIMITED_LOWBITS;
      for (lowbits = 1; lowbits < 8; lowbits++)
        {
          failed = FALSE;
          for (count = 0; count < hashsize; count++) hashed[count] = NULL;
          for (count = 0; count < strings; count++)
            {
              hash = 0;
              for (p = s[count]; *p; p++)
                hash += (hash << lowbits)
                  | ((unsigned int) *p & mask[lowbits]);
              hash = hash % modulus;

              if (hashed[hash])
                {
                  if (verbose > 3)
                    fprintf(stderr, "lowbits %d conflict '%s' '%s', %s %u\n",
                            lowbits, s[count], hashed[hash], "hash =", hash);
                  failed = TRUE;
                  break;
                }
              hashed[hash] = s[count];
            }

          if (!failed)
            break;
        }
      if (!failed)
        break;

      /* Then try limiting number of characters to be summed */

      for (strategy = 1; strategy < MAX_LIMITED_STRATEGY; strategy++)
        {
          if (verbose > 3)
            fprintf(stderr, "Limited strategy %d\n", strategy);

          failed = FALSE;
          for (count = 0; count < hashsize; count++) hashed[count] = NULL;
          for (count = 0; count < strings; count++)
            {
              register int limiter;

              hash = 0;
              for (p = s[count], limiter = 0;
                   *p && limiter < strategy;
                   p++, limiter++)
                hash += (unsigned int) *p;
              hash = hash % modulus;

              if (hashed[hash])
                {
                  if (verbose > 3)
                    fprintf(stderr, "Limited %d: %s '%s' '%s', hash = %u\n",
                            strategy, "conflict", s[count], hashed[hash],
                            hash);
                  failed = TRUE;
                  break;
                }
              hashed[hash] = s[count];
            }

          if (!failed)
            break;
        }

      /* If all strings fitted in the table, we are done */

      if (!failed)
        break;

      hashsize += hashsize;
    }

  /* Every successful exit from the loop above leaves hashsize below the
     limit; reaching the limit means nothing worked. */
  if (hashsize >= MAX_HASH_SIZE)
    {
      fprintf(stderr, "Couldn't find good hash table for input\n");
      exit(EX_DATAERR);
    }

  /* Produce piece of C code and hash function for it */

  if (verbose)
    {
      fprintf(stderr, "Could produce a hash table of %d strings,\n", hashsize);
      if (strategy == STRATEGY_UNLIMITED)
        fprintf(stderr, "using unlimited strategy,\n");
      else if (strategy == STRATEGY_UNLIMITED_XOR)
        fprintf(stderr, "using unlimited xor strategy,\n");
      else if (strategy == STRATEGY_UNLIMITED_LOWBITS)
        fprintf(stderr, "using unlimited low bit strategy (%d bits),\n",
                lowbits);
      else
        fprintf(stderr, "using strategy limiting number of %s to %d,\n",
                "characters summed", strategy);
      fprintf(stderr, "for original strings of %d, waste being %d%%.\n",
              strings, (hashsize - strings) * 100 / hashsize);
    }

  /* The checking variant of the generated function calls strcmp(). */
  if (docheck) {
#ifdef HAVE_STRING_H
    fprintf(output, "#include <string.h>\n\n");
#else
    fprintf(output, "#include <strings.h>\n\n");
#endif
  }

  if (hashtable)
    {
      /* Output hash table */

      fprintf(output, "char *%s_table[%d] = {\n", hashname, hashsize);
      for (count = 0; count < hashsize - 1; count++)
        print_string_table_entry(output, hashed[count], mismatch_nulls, FALSE);
      print_string_table_entry(output, hashed[count], mismatch_nulls, TRUE);
      fputs("};\n\n", output);
    }

  /* Shape of the generated function, by strategy.  In every case the
   * accumulation mirrors the search loop above exactly:
   *
   * unlimited:          for (; *s; s++) hash += (unsigned int) *s;
   * unlimited xor:      for (; *s; s++) hash += hash ^ (unsigned int) *s;
   * unlimited lowbits:  for (; *s; s++)
   *                       hash += (hash << <lowbits>)
   *                               | ((unsigned int) *s & <mask>);
   * limited to <n>:     for (count = 0; *s && count < <n>; s++, count++)
   *                       hash += (unsigned int) *s;
   *
   * followed by "return hash & <bitmask>;".  With -c the masked hash is
   * first compared against the table entry and -1 is returned on a
   * mismatch.  Without -c a limit of one character collapses to
   *
   *   return (unsigned int) *s & <bitmask>;
   */

  /* hashsize is a power of two, so masking with hashsize - 1 equals the
     modulus used during the search.  A one-slot table gets mask 0, so the
     generated function can only ever return index 0. */
  bitmask = hashsize - 1;

  fprintf(output, "unsigned int %s_function(s)\n", hashname);
  fprintf(output, "     register char *s;\n");
  fprintf(output, "{\n");

  if (docheck)
    fprintf(output, "  register char *original_string = s;\n\n");

  /* The one-character shortcut has no place for the table check, so with
     -c the general limited form (with a limit of 1) is emitted instead. */
  if (strategy == 1 && !docheck)
    fprintf(output, "  return (unsigned int) *s & %d;\n", bitmask);
  else
    {
      if (strategy == STRATEGY_UNLIMITED ||
          strategy == STRATEGY_UNLIMITED_XOR ||
          strategy == STRATEGY_UNLIMITED_LOWBITS)
        {
          fprintf(output, "  register unsigned int hash = 0;\n");
          fprintf(output, "  for (; *s; s++)\n");

          if (strategy == STRATEGY_UNLIMITED_XOR)
            fprintf(output, "    hash += hash ^ (unsigned int) *s;\n");
          else if (strategy == STRATEGY_UNLIMITED_LOWBITS)
            fprintf(output,
                    "    hash += (hash << %d) | ((unsigned int) *s & %u);\n",
                    lowbits, mask[lowbits]);
          else
            fprintf(output, "    hash += (unsigned int) *s;\n");
        }
      else
        {
          fprintf(output, "  register unsigned int hash = 0;\n");
          fprintf(output, "  register int count;\n");
          fprintf(output, "  for (count = 0; *s && count < %d; s++, count++)\n",
                  strategy);
          fprintf(output, "    hash += (unsigned int) *s;\n");
        }

      if (docheck)
        {
          fprintf(output, "\n  hash = hash & %d;\n", bitmask);
          if (mismatch_nulls)
            /* Unused slots are NULL in the table; strcmp() must not be
               handed a null pointer. */
            fprintf(output,
                    "  if (!%s_table[hash] || strcmp(%s_table[hash], %s))\n    return -1;\n",
                    hashname, hashname, "original_string");
          else
            fprintf(output,
                    "  if (strcmp(%s_table[hash], %s))\n    return -1;\n",
                    hashname, "original_string");
          fprintf(output, "  return hash;\n");
        }
      else
        fprintf(output, "  return hash & %d;\n", bitmask);
    }
  fprintf(output, "}\n");

  /* fclose() flushes; a failure here means the .c file is incomplete. */
  if (*outputfile && fclose(output) == EOF)
    {
      fprintf(stderr, "panic: Error writing output file %s\n", outputfile);
      exit(EX_UNAVAILABLE);
      /*NOTREACHED*/
    }

  /* Output defines, if asked for */

  if (defines)
    {
      if (*outputfile)
        {
          /* Replace the suffix of the last path component with ".h". */
          p = strrchr(outputfile, '/');
          p = strchr(p ? p + 1 : outputfile, '.');
          if (!p)
            {
              fprintf(stderr, "panic: dot disappeared from output %s (%s)\n",
                      "filename", outputfile);
              exit(EX_SOFTWARE);
              /*NOTREACHED*/
            }
          (void) strcpy(p, ".h");
          if ((output = fopen(outputfile, "w+")) == NULL)
            {
              fprintf(stderr, "panic: Cannot open output header file %s\n",
                      outputfile);
              exit(EX_UNAVAILABLE);
              /*NOTREACHED*/
            }
        }

      for (count = 0; count < hashsize; count++)
        {
          if (!hashed[count])
            continue;

          /* One "#define <head><NAME> <slot>" per used slot.  Lower case
             is upcased and a few characters that cannot appear in an
             identifier are replaced or spelled out. */
          fprintf(output, "#define %s", define_head);
          for (p = hashed[count]; *p; p++)
            {
              /* <ctype.h> functions need an unsigned char value. */
              if (islower((unsigned char) *p))
                putc(toupper((unsigned char) *p), output);
              else if (*p == '-')
                putc('_', output); /* Defines cannot include - */
              else if (*p == '{')
                fputs("LEFTBRACE", output);
              else if (*p == '}')
                fputs("RIGHTBRACE", output);
              else if (*p == '#')
                fputs("HASH", output);
              else if (*p == '`')
                fputs("BACKQUOTE", output);
              else if (*p == '\'')
                fputs("QUOTE", output);
              else if (*p == ':')
                fputs("COLON", output);
              else if (*p == ';')
                fputs("SEMICOLON", output);
              else
                putc(*p, output);
            }
          fprintf(output, " %d\n", count);
        }

      /* Declaration for hash function */

      fprintf(output, "\n#ifdef __STDC__\n");
      fprintf(output, "extern unsigned int %s_function(char *);\n", hashname);
      fprintf(output, "#else\n");
      fprintf(output, "extern unsigned int %s_function();\n", hashname);
      fprintf(output, "#endif\n");
      if (hashtable)
        {
          fprintf(output, "\nextern char *%s_table[%d];\n",
                  hashname, hashsize);
        }

      if (*outputfile && fclose(output) == EOF)
        {
          fprintf(stderr, "panic: Error writing output header file %s\n",
                  outputfile);
          exit(EX_UNAVAILABLE);
          /*NOTREACHED*/
        }
    }
  exit(EX_OK);
  return EX_OK;			/* Just to make gcc happy */
}



