/* copyright (c) CNIDR (see ../COPYRIGHT)

7/29/92

This program attempts to scan the dictionary and inverted file to determine the keywords that best
describe a database.  These could then be included in the description file.
$Log:	irkeywords.c,v $
 * Revision 1.3  93/06/23  19:57:52  warnock
 * Fix from tovio@sage.ucs.uwa.edu.au for empty keywords array
 * 
 * Revision 1.2  93/02/16  17:07:49  freewais
 * added AT&T patches for keyword list
 * 

*/

#include <string.h>
#include <sys/types.h>
#include <sys/param.h>
#include "irdirent.h"
#include "cutil.h"
#include "futil.h"
#include "irfiles.h"
#include "irtfiles.h"
#include "panic.h"
#include "ircfiles.h"
#include "version.h"
#include "irext.h"
#include "irlex.h"

/* FILE *logfile; */

/*
 * Dictionary state cached at file scope.
 *
 * dictionary_header  the dictionary header block; retreive_keywords()
 *                    reads it only while this pointer is still NULL
 * dictionary         buffer holding the dictionary block currently being
 *                    scanned; handed back to read_dictionary_block() for
 *                    every block
 * number_of_blocks   block count read from the start of the dictionary
 *                    file; also used as the length of the header block
 */
unsigned char *dictionary_header = NULL;
unsigned char *dictionary = NULL;
long number_of_blocks = 0;

/*
 * Keyword results filled in by retreive_keywords().
 *
 * keyword[]   keyword strings (retreive_keywords() uses at most the
 *             first 50 slots)
 * keyvalue[]  occurrence counts recorded while the candidate list is
 *             being built
 * stored      number of candidate entries currently filled in
 * nKeys       number of keywords finally selected
 */
char *keyword[100];
long keyvalue[100];
long stored;
short nKeys = 0;

int
retreive_keywords(db)
     database* db;
{
  long i,j, k, l, tmp, limit;
  char file[MAX_FILE_NAME_LEN + 1 ];
  double x;
  char *ptr;
  int tmpval;

  if(NULL == dictionary_header) {
    FILE *stream = db->dictionary_stream;
    s_fclose(stream);
    db->dictionary_stream = s_fopen(dictionary_filename(file, db), "r+b");
    stream = db->dictionary_stream;
    s_fseek(stream, 0L, SEEK_SET);
    number_of_blocks = read_bytes(DICTIONARY_HEADER_SIZE,stream);
    dictionary_header=
      read_dictionary_block(dictionary_header,DICTIONARY_HEADER_SIZE,
			    number_of_blocks,stream);
    if(NULL == dictionary_header) {
      printf("Could not read dictionary header block in db %s.", db->database_file);
      return(0);
    }
  }
  look_up_total_word_count(db);
  stored = 0;
  for (i=0; i<=number_of_blocks; i++) {
    FILE *stream = db->dictionary_stream;
    dictionary = read_dictionary_block(dictionary,dictionary_block_position(i, dictionary_header),
				       DICTIONARY_BLOCK_SIZE, stream);
/*
 * I think we shouldn't loop past the end of the DICTIONARY_BLOCK_SIZE, eh?
 * - aw3 12/94
 *
 *   for (j=0; j<=DICTIONARY_BLOCK_SIZE; j++) 
 */
    for (j=0; j<DICTIONARY_BLOCK_SIZE; j++)
      if (strlen(dictionary_block_word(j, dictionary))) {
	tmp = dictionary_block_word_occurances(j, dictionary);
	if (tmp == db->total_word_count)
	  goto done;
	k=0;
	while (k<stored) {
	  if (tmp > keyvalue[k])
	    break;
	  k++;
	}
	if (k == stored) {
	  if (stored < 50) {
	    keyvalue[stored]= tmp;
	    tmpval=strlen(dictionary_block_word(j, dictionary))+1;
	    keyword[stored++] = s_malloc(tmpval);
	    strcpy(keyword[stored-1], dictionary_block_word(j, dictionary));
	  }
	} else {
	  if (stored < 50)
	    stored++;
	  for (l=stored-1; l>k; l--) {
	    if (l==49)
	      free(keyword[l]);
	    keyword[l] = keyword[l-1];
	    keyvalue[l] = keyvalue[l-1];
	  }
	  tmpval=strlen(dictionary_block_word(j, dictionary))+1;
	  keyword[k] = s_malloc(tmpval);
	  strcpy(keyword[k], dictionary_block_word(j, dictionary));
	  keyvalue[k] = tmp;
	}
      }
  }
 done:
  /* done getting, now cull and sort */
  x = 0;
  for (i=0; i<50 && x<0.2; i++)
    x += ((double) keyvalue[i])/db->total_word_count;
  limit = i;
  /* patch from tovio@sage.ucs.uwa.edu.au to fix problem with empty
   *	keywords array
   */
  if (i>0)
    for (i=0; i<=limit-1; i++) {
      for (j=i; j<limit; j++)
	if (strcmp(keyword[i],keyword[j])>0) {
	  ptr=keyword[i]; keyword[i]=keyword[j]; keyword[j]=ptr;
	}
    }
  nKeys = limit;
  return(1);
}
