
/* Copyright (c) CNIDR (Work in progress) */

/* WIDE AREA INFORMATION SERVER SOFTWARE
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.    
   Brewster@think.com
*/


/* implements the search part of irext.h 
   (search_word and finished_search_word)
   -brewster

Split from irsearch.c

   5/31/91 Added scale_scores.  Fixed document_score_array to long.
   7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
   2/4/92 Made document_score_array a double.

   - Jonny G
 * $Log: sersrch.c,v $
 * Revision 1.49  1994/07/13  07:52:36  huynh1
 * Uli
 *
 * Revision 1.48  1994/05/27  09:13:21  huynh1
 * boolean code updated. beta
 *
 * Revision 1.47  1994/05/26  14:33:57  huynh1
 * search_word updated (read_weight_from_stream).
 * beta.
 *
 * Revision 1.46  1994/05/20  12:49:58  pfeifer
 * beta
 *
 * Revision 1.45  1994/05/19  12:44:39  huynh1
 * search_word updated.
 *
 * Revision 1.44  1994/05/18  17:28:13  huynh1
 * new term weighting
 * higher retrieval quality.
 *
 * Revision 1.40  1994/04/28  16:28:01  huynh1
 * stemming
 *
 * Revision 1.39  1994/04/06  23:52:04  huynh1
 * 08, autoconf, Uli
 *
 * Revision 1.38  1994/03/23  13:11:07  pfeifer
 * removed include iso.h
 *
 * Revision 1.37  1994/03/08  20:46:12  huynh1
 * Patchlevel 04
 *
 * Revision 1.36  1994/02/14  10:33:04  huynh1
 * new code for field concept added.
 *
 * Revision 1.36  1993/12/08  17:38:00  huynh1
 * bug by mixing literal and nested boolean corrected!
 *
 * Revision 1.10  1993/10/13  14:14:20  huynh1
 * new code added for encapsulated boolean queries and
 * modified literal search
 *
 * Revision 1.3  1993/07/13  08:19:56  pfeifer
 * Sicherung vor Aenderungen Tung
 *
 * Revision 1.1  1993/02/16  15:05:35  freewais
 * Initial revision
 *
 * Revision 1.24  92/04/28  16:56:54  morris
 * added boolean to serial engine
 * 
 * Revision 1.23  92/03/15  10:15:18  jonathan
 * Added Simon Spero's ASSIGN replacement for read_bytes.
 * 
 * Revision 1.22  92/03/05  07:09:54  shen
 * add two more dummy arguments to call to init_search_engine
 * 
 * Revision 1.21  92/02/12  17:29:52  jonathan
 * Conditionalized inclusion of object code.
 * 
 * Revision 1.20  92/02/12  13:40:06  jonathan
 * Added "$Log" so RCS will put the log message in the header
 * 
*/

#include "cutil.h"
#include "irfiles.h"
#ifdef BIO
#include "irtfiles.h" /* dgg, for wordDelimiter */
#endif
#include "irsearch.h"
#include "irext.h"
#include "byte_order.h"
/* #include <string.h> */
#include <ctype.h>

#include <math.h>

/* 2 raised to the power (number of bits in a long - 1), as an unsigned long.
   The earlier definition spelled this with '^', which is bitwise XOR in C,
   not exponentiation, and it was not parenthesized as a whole, so it could
   combine unexpectedly with neighbouring operators at the point of use.
   NOTE(review): presumably this is the value the author intended -- confirm
   against the uses of MAXINT. */
#define MAXINT ((unsigned long)1 << (sizeof(long)*8-1))
#define VALUE 1000000L
/* francois */
#include "stemmer.h"   

/* tung, 10/93 */
#ifdef NESTED_BOOLEANS
#include "boolean_op.h"
#endif
/* tung, 10/93 */

#ifdef FIELDS /* tung, 1/94 */
#include "field_search.h"
#endif

#ifdef NEW_WEIGHT /* tung, 5/94 */
#include "weight.h"
#endif

#ifdef BOOL
#include "obj.h"
#include "irparse.h"
object* currentQuery = NULL; /* kludge until irext goes away */
#endif /* def BOOL */

/* weighting for relevant document terms - 
   this may become a parameter to the query.
*/

#define RF_WEIGHTING 0.1

/* ==================================
 * ===  Initialization Functions  ===
 * ==================================*/


/* One-time start-up hook for the search engine.
   None of the arguments is used by this implementation.  On the first call
   (and only when BOOL is defined) initObj() and initBool() are run; every
   later call is a no-op.  Always returns 0. */
long init_search_engine(file, initialize, for_search, cm_mem_percent, 
text_size, grow_percent)
  char* file;
  boolean initialize;
  boolean for_search;
  long cm_mem_percent;  /* unused */
  long text_size;     /* unused */
  long grow_percent;  /* unused */
{
  static boolean already_initialized = false;

  /* the set-up below must run only once per process */
  if (already_initialized != false)
    return(0);

#ifdef BOOL
  initObj();
  initBool();
#endif
  already_initialized = true;

  return(0);
}

/* Shut-down counterpart of init_search_engine.  This implementation has
   nothing to release, so it only reports success (0). */
long finished_search_engine()
{
  const long status = 0;

  return status;
}


/*
 *  ext_open_database: see irext.h
 *
 *  Opens the inverted index (.inv) file of db and stores the resulting
 *  stream in db->index_stream.  The open mode follows the flags:
 *    initialize set   -> "w+b"  (make a new index)
 *    for_search set   -> "rb"   (just search)
 *    neither          -> "r+b"  (write to an existing index)
 *  Returns 0 on success.  If the file cannot be opened the failure is
 *  logged, disposeDatabase(db) is called and 1 is returned.
 */

long ext_open_database (db, initialize, for_search)
     database *db;
     boolean initialize;
     boolean for_search;
{
  char index_name[MAX_FILE_NAME_LEN];
  char *open_mode;

  if(initialize)
    open_mode = "w+b";
  else if(for_search)
    open_mode = "rb";
  else
    open_mode = "r+b";

  db->index_stream = s_fopen(index_filename(index_name, db), open_mode);
  if (db->index_stream != NULL)
    return(0);

  waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n", 
	    index_name);
  disposeDatabase(db);
  return(1);
}
  


/*
 *  ext_close_database: see irext.h
 *
 *  This implementation does no work: db is not touched and 0 is always
 *  returned.
 */

long ext_close_database (db)
     database *db;
{
  return 0;
}

/* Maps a database name to the name of its file.  Here the two are the
   same: the argument pointer itself is handed back, no copy is made. */
char *database_file(database_name)
     char *database_name;
{
  char *file_name = database_name;

  return file_name;
}
  
/*============================*
 *===  Setting Parameters  ===*
 *============================*/

/* Number of best_hits_array slots that sort_best_hits fills and ranks;
   set by set_query_parameter with SET_MAX_RETRIEVED_MASK. */
long max_hit_retrieved = 0;
/* Source list installed by set_query_parameter with SET_SELECT_SOURCE;
   the previously installed list is freed when a new one arrives. */
char **srcs = NULL;

/* Applies the single query parameter selected by mask, taking its value
   from *parameters.
     SET_MAX_RETRIEVED_MASK  copies parameters->max_hit_retrieved into the
                             global max_hit_retrieved.
     SET_SELECT_SOURCE       frees the current srcs list (first element,
                             then the list itself) and adopts
                             parameters->srcs in its place.
   Returns 0 on success and -1 when mask is not one of the above. */
long set_query_parameter (mask, parameters)
     long mask;
     query_parameter_type * parameters;
{
  if (mask == SET_MAX_RETRIEVED_MASK) {
    max_hit_retrieved = parameters->max_hit_retrieved;
    return(0);
  }

  if (mask != SET_SELECT_SOURCE)
    return(-1);

  /* drop the list left by an earlier call before taking over the new one */
  if (srcs != NULL) {
    if (srcs[0] != NULL) {
      s_free(srcs[0]);
    }
    s_free(srcs);
  }
  srcs = parameters->srcs;
  return(0);
}

/*==============================*
 *===  Document Score Array  ===*
 *==============================*/

/* Per-document score accumulator, indexed by internal document id.
   Grown by make_document_score_array and zeroed by
   clear_document_score_array. */
double *document_score_array = NULL;
/* Number of entries allocated in document_score_array and in the
   companion array below. */
long document_score_array_len = 0;
#ifdef NESTED_BOOLEANS /* tung, 1/94 */
/* Companion array; allocated, cleared and freed together with
   document_score_array. */
double *NumPart_score_array = NULL;
#else
#ifdef BOOLEANS
/* Companion array; allocated, cleared and freed together with
   document_score_array. */
double *prev_score_array = NULL;					/* 12/91 GS TLG */
#endif
#endif

#ifdef NESTED_BOOLEANS
/* tung, 10/93 */
/* One entry per operand of a nested boolean query; allocated lazily by
   make_search_result_array and released by clear_search_result_array. */
search_result_struct *search_result_array = NULL;
/* Index of the search_result_array entry to be filled next; reset to 0
   whenever make_search_result_array allocates a fresh array. */
long operand_id = 0;

/* Releases search_result_array together with the doc_ids_array of each of
   its first *number_of_elements entries.  Nothing is freed unless
   *number_of_elements is greater than 1 and the array exists.  In every
   case *number_of_elements is set to 1 before returning. */
static void clear_search_result_array _AP((long* number_of_elements));
static void clear_search_result_array(number_of_elements)
     long* number_of_elements;
{
  long idx = 0;

  if(*number_of_elements > 1 && search_result_array != NULL) {
    while(idx < *number_of_elements) {
      if(search_result_array[idx].doc_ids_array != NULL) {
        s_free(search_result_array[idx].doc_ids_array);
      }
      idx++;
    }
    s_free(search_result_array);
  }

  *number_of_elements = 1;
}

/* Allocates search_result_array with room for `length` entries and resets
   operand_id to 0 -- but only when no array exists yet.  If one is already
   present nothing is changed (and `length` is ignored). */
static void make_search_result_array _AP((long length));
static void make_search_result_array(length)
     long length;
{
  size_t bytes_needed;

  if(search_result_array != NULL)
    return;

  bytes_needed = (size_t)(length * sizeof(search_result_struct));
  search_result_array = (search_result_struct *)s_malloc(bytes_needed);
  operand_id = 0;
}

/* Allocates a doc_ids_array of `length` descriptors for entry `pos` of
   search_result_array.  Returns true on success; if the allocation yields
   NULL, "Out of memory" is logged and false is returned.
   NOTE(review): whatever doc_ids_array the entry held before is
   overwritten without being freed -- confirm callers never reuse a slot. */
static boolean make_doc_ids_array _AP((long pos, long length));
static boolean make_doc_ids_array(pos, length)
     long pos;
     long length;
{
  search_result_array[pos].doc_ids_array =
    (doc_descr_struct *)
      s_malloc((size_t)(sizeof(doc_descr_struct) * length));

  if(search_result_array[pos].doc_ids_array != NULL)
    return(true);

  waislog(WLOG_HIGH, WLOG_ERROR, "Out of memory");
  return(false);
}
/* tung, 10/93 */
#endif

/* make_document_score_array ensures that document_score_array -- and the
   companion array chosen at compile time (NumPart_score_array under
   NESTED_BOOLEANS, otherwise prev_score_array under BOOLEANS) -- can hold
   at least `length` entries.
   If the arrays are already long enough nothing happens.  Otherwise the old
   arrays are freed (their contents are NOT carried over), new arrays of
   exactly `length` entries are allocated and document_score_array_len is
   updated. */
static void make_document_score_array _AP((long length ));
static void make_document_score_array(length)
long length;
{
  if(length <= document_score_array_len)
    return;
  /* we have to make a new one.  free the old one first (if any) */
  if(document_score_array != 0){
    s_free(document_score_array);
#ifdef NESTED_BOOLEANS /* tung, 1/94 */
    s_free(NumPart_score_array);
#else
#ifdef BOOLEANS
    s_free(prev_score_array);					/* 12/91 GS TLG */
#endif
#endif
  }
  document_score_array = (double*)s_malloc((size_t)(length * sizeof(double)));
#ifdef NESTED_BOOLEANS /* tung, 1/94 */
  NumPart_score_array   = (double*)s_malloc((size_t)(length * sizeof(double)));
  /* Clear the whole NEW array.  document_score_array_len still holds the
     old length at this point (0 on the first call), so it must not be used
     as the size here. */
  if(NumPart_score_array != NULL)
    memset(NumPart_score_array, 0, (size_t)length * sizeof(double));
#else
#ifdef BOOLEANS
  prev_score_array   = (double*)s_malloc((size_t)(length * sizeof(double)));  /* 12/91 GS TLG */
#endif
#endif
  document_score_array_len = length;
}

/* Frees document_score_array and its compile-time companion array
   (NumPart_score_array or prev_score_array) and records that no entries
   are allocated any more. */
static void destroy_document_score_array _AP((void));
static void destroy_document_score_array()
{
  document_score_array_len = 0;

  s_free(document_score_array);
#if defined(NESTED_BOOLEANS) /* tung, 1/94 */
  s_free(NumPart_score_array);
#elif defined(BOOLEANS)
  s_free(prev_score_array);					/* 12/91 GS TLG */
#endif
}
    
/* Zeroes all document_score_array_len entries of document_score_array and
   of its compile-time companion array (NumPart_score_array or
   prev_score_array).
   Arrays that have not been allocated yet are skipped: this function is
   reached from finished_best_hit even when no search allocated them, and
   handing a null pointer to memset is undefined behaviour even for a size
   of 0. */
void clear_document_score_array()
     /* side effects the document_score_array. */
{ 
  size_t nbytes = (size_t)document_score_array_len * sizeof(double);

  if(document_score_array != NULL)
    memset(document_score_array, 0, nbytes);
#ifdef NESTED_BOOLEANS /* tung, 1/94 */
  if(NumPart_score_array != NULL)
    memset(NumPart_score_array, 0, nbytes);
#else
#ifdef BOOLEANS
  if(prev_score_array != NULL)					/* 12/91 GS TLG */
    memset(prev_score_array, 0, nbytes);			/* 12/91 GS TLG */
#endif
#endif
}

/* for debugging purposes */
/* Prints entries start..stop (both inclusive) of document_score_array to
   stdout, one "entry number N: score" line each.  Does nothing if the array
   has not been allocated.
   The index is kept as unsigned long, matching the parameters, and printed
   with %lu; the earlier code printed a long with %d, a mismatch between
   conversion specifier and argument type. */
void print_document_score_array(start,stop)
unsigned long start;
unsigned long stop;
/* assumes start >= 0, stop < db->doc_table_allocated_entries */
{
	unsigned long i;

	if(document_score_array == NULL)
		return;
	for(i = start; i <= stop; i++){
		printf("entry number %lu: %f \n", 
		       i, document_score_array[i]);
	}
}



/*=========================*
 *===  Best Hits Array  ===*
 *=========================*/

/* Ranked hit list filled by sort_best_hits and handed out one entry at a
   time by best_hit. */
hit *best_hits_array = NULL;
/* Number of entries allocated in best_hits_array. */
long best_hits_array_len = 0;
/* Index of the entry best_hit will return next; reset to 0 by
   finished_best_hit and by search_word. */
long current_best_hit = 0;
/* Smallest and largest document id touched by search_word; they bound the
   scan in sort_best_hits, which resets both to 0 afterwards. */
long doc_start = 0; /* tung, 5/94 */
long doc_end = 0;   /* tung, 5/94 */

/* see irext.h for doc */
/* When BOOL is defined and a boolean query object is active, the
   InitBestHit message is sent to it with db as argument.  Otherwise nothing
   is done.  Always returns 0. */
long init_best_hit (db)
     database *db;
{
#ifdef BOOL
  if (NULL != currentQuery) {
    send(currentQuery,InitBestHit,db);
  }
#endif /* def BOOL */

  return 0;
}

/* make_best_hits_array makes sure best_hits_array has room for at least
   `length` hits.  A sufficiently long array is left alone; otherwise the
   old array (if any) is freed -- its contents are not kept -- and a new
   one of exactly `length` entries is allocated. */
static void make_best_hits_array _AP((long length));
static void make_best_hits_array(length)
long length;
{
  if(length > best_hits_array_len) {
    /* too short: throw the old array away and start over */
    if(NULL != best_hits_array) {
      s_free(best_hits_array);
    }
    best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
    best_hits_array_len = length;
  }
}

/* Frees best_hits_array and records that it no longer has any entries. */
static void destroy_best_hits_array _AP((void));
static void destroy_best_hits_array()
{
  best_hits_array_len = 0;
  s_free(best_hits_array);
}
    
/* Zeroes every entry of best_hits_array (side effects the best_hits_array).
   Does nothing when the array has not been allocated: finished_best_hit
   calls this unconditionally, and passing a null pointer to memset is
   undefined behaviour even when the size is 0. */
void clear_best_hits_array()
{ 
  if(best_hits_array == NULL)
    return;
  memset((char*)best_hits_array, 0,
	 (size_t)best_hits_array_len * sizeof(hit));
}

/* for debugging purposes */
void print_best_hits()
{
  long i;
  for( i = 0; i < best_hits_array_len; i++){
    if (best_hits_array[i].weight != 0)
      { printf("Best hit %ld: weight %lf, doc_id %ld, headline %s, filename %s, lines %ld\n", 
	       i, best_hits_array[i].weight, 
	       best_hits_array[i].document_id,
	       best_hits_array[i].headline,
	       best_hits_array[i].filename,
	       best_hits_array[i].number_of_lines);
      }
  }
}

/* Builds the ranked hit list for the query that was just evaluated.
 *
 * Returns nothing; side effects best_hits_array, doc_start and doc_end.
 *
 * Phase 1: the weights of the first max_hit_retrieved slots are zeroed.
 * Phase 2: every document id in doc_start..doc_end with a positive score in
 *          document_score_array is insertion-sorted (by descending weight)
 *          into those slots.  Unless NEW_WEIGHT is defined the score is
 *          first divided by the document length read from the document
 *          table (a zero length gives weight 0).
 * Phase 3: for every ranked slot whose document table entry can be read,
 *          the descriptive fields (positions, length, lines, date,
 *          filename, type, headline) are filled in; slots whose entry
 *          cannot be read are squeezed out, so the usable hits end up
 *          contiguous at the front.
 * Phase 4: the weights of all slots behind the last usable hit are zeroed
 *          so that best_hit stops there.
 *
 * Assumes best_hits_array has at least max_hit_retrieved entries and
 * document_score_array covers doc_start..doc_end -- neither is checked here.
 */
void sort_best_hits(db)
     database * db;
{
  long i, doc;
  double worst_weight_to_make_it = 0.0;
  document_table_entry doc_entry;
  long best_hit_number = 0;

  /* snuff the scores */
  for(i = 0; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0.0;
  }

  /* loop over the doc, and keep the doc_id and weight in best hit table */
  for(doc = doc_start; doc <= doc_end; doc++) {
    double weight = document_score_array[doc];
    /* jmf */
    if(weight > 0) {
#ifndef NEW_WEIGHT /* tung, 5/94 */
      read_document_table_entry(&doc_entry, doc, db);  /* if this could be
							  removed, we'd gain speed */
      if (doc_entry.document_length)
	weight/=doc_entry.document_length;
      else
        weight = 0;
#endif
      if(worst_weight_to_make_it < weight){
	/* merge it into the best_hits array. start at the bottom */
	for(i = (max_hit_retrieved - 1); i >= 0; i--){
	  if(weight > best_hits_array[i].weight 
	     /* && (check_document_id(doc, db) == true) too slow.*/
	     ){
	    /* move this entry down */	
	    if((i + 1) < max_hit_retrieved){
	      best_hits_array[i+1].weight = best_hits_array[i].weight;
	      best_hits_array[i+1].document_id = best_hits_array[i].document_id;
	    }
	    best_hits_array[i].document_id = doc;
	    best_hits_array[i].weight = weight;
	  }
	  else
	    break;
	}      
      }
    }
  }
  doc_start = doc_end = 0; /* tung, 5/94 */

  for(i = 0; i < max_hit_retrieved; i++){
    /* The first empty slot ends the ranked list.  Leave the loop with break
       rather than returning, so that the clean-up below still runs and
       wipes any stale slots left behind by the compaction. */
    if(best_hits_array[i].weight <= 0.0)
      break;
    if (read_document_table_entry(&doc_entry,
				  best_hits_array[i].document_id,
				  db) 
	== true){
      /* best_hit_number <= i, so this copies a hit forward over the gap
	 left by entries that could not be read */
      best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
      best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
      best_hits_array[best_hit_number].start_character = doc_entry.start_character;
      best_hits_array[best_hit_number].end_character = doc_entry.end_character;
      best_hits_array[best_hit_number].document_length = doc_entry.document_length;
      best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
      sprintf(best_hits_array[best_hit_number].date, "%d", doc_entry.date);
      read_filename_table_entry(doc_entry.filename_id, 
				best_hits_array[best_hit_number].filename,
				best_hits_array[best_hit_number].type,
				NULL,
				db);
      /* NOTE(review): strncpy does not terminate the copy when the headline
	 is MAX_FILE_NAME_LEN characters or longer -- confirm the headline
	 field is larger than that and zeroed beforehand. */
      strncpy(best_hits_array[best_hit_number].headline, 
	      read_headline_table_entry(doc_entry.headline_id,db),
	      MAX_FILE_NAME_LEN);
      best_hit_number++;
    } 
    beFriendly();
  }

  /* wipe every slot behind the last usable hit */
  for(i = best_hit_number; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0.0;
  }
  /* print_best_hits(s);  for debugging */
}


/* returns the next best hit */
long best_hit(db, doc_id, best_character, best_line, score,start,end,date,
length,nlines,headline,filename,type)
     database *db;
     long *doc_id;	
     long *best_character;
     long *best_line;
     double *score;
	long *start,*end,*date,*length,*nlines;
char *headline,*filename,*type;
{
double tmp;

  *best_character = 0; 
  *best_line = 0;
  
#ifdef BOOL
  if (currentQuery != NULL) /* for boolean */
   {
     send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score);
     if (*doc_id > 0)
       return(0); /* ok */
     else
       return(-1); /* no more docs */
   }
#endif /* BOOL */

  if(current_best_hit > best_hits_array_len)
    return(1);
  if(best_hits_array[current_best_hit].weight == 0.0)
    return(1);
  *doc_id = best_hits_array[current_best_hit].document_id;
  tmp  = ((double)(best_hits_array[current_best_hit].weight*VALUE));
*score=tmp;
*start=best_hits_array[current_best_hit].start_character;
*end=best_hits_array[current_best_hit].end_character;
*date=atoi(best_hits_array[current_best_hit].date);
*length=best_hits_array[current_best_hit].document_length;
*nlines=best_hits_array[current_best_hit].number_of_lines;
strcpy(headline,best_hits_array[current_best_hit].headline);
strcpy(filename,best_hits_array[current_best_hit].filename);
strcpy(type,best_hits_array[current_best_hit].type);
  current_best_hit++;
  return(0);
}

/* Ends the retrieval of hits for the current query.
   With BOOL defined and a boolean query active, the query object is sent
   Delete, currentQuery is cleared, and nothing else is touched.
   Otherwise the score array and the best-hits array are zeroed and the
   best-hit cursor is rewound to 0.  db is not used.  Always returns 0. */
long finished_best_hit(db)
database *db;
{ 
#ifdef BOOL
  if (NULL != currentQuery) { /* for boolean */
    send(currentQuery,Delete);
    currentQuery = NULL;
    return(0);
  }
#endif /* BOOL */

  current_best_hit = 0;

  /* if we are on a small machine, we might want to 
     destroy_document_score_array */
  clear_document_score_array();
  clear_best_hits_array();

  return(0);
}

/*=============================*	
 *===  Searching for words  ===*
 *=============================*/

/* see irext.h for doc */
long init_search_word (db)
     database* db;
{
char fn[256];
  strcpy( fn,db->database_file );
  strcat( fn,synonym_ext );
  syn_ReadFile( fn,&db->syn_Table,&db->syn_Table_Size );

  return(0);
}

#ifdef NESTED_BOOLEANS
/* tung, 10/93 */
extern long number_of_operands ;
/* tung, 10/93 */
#endif

#ifdef BOOLEANS
/* Set by search_word when the word it receives is BOOLEAN_AND; cleared
   again on search_word's error exits. */
static boolean   gLastAnd= false;
/* Set by search_word when the word it receives is BOOLEAN_NOT or its
   weight is BOOLEAN_NOT_FLAG; while set, documents containing the next
   word get their score forced to 0.  Cleared on search_word's error exits. */
static boolean   gLastNot= false;
#endif

/* see irext.h for doc */
long search_word(word,
#ifdef FIELDS /* tung, 5/94 */
		 field_name,
#endif
		 char_pos, line_pos, weight, doc_id, 
		 word_pair, db)
     char *word; /* the word to be searched for */
#ifdef FIELDS /* tung, 5/94 */
     char *field_name;
#endif
     long char_pos;		/* the position of the start of the word */
     long line_pos;		/* is this needed? not for signature system */
     long weight;		/* how important the word looks syntactically,
				   such as is it bold */
     long doc_id;		/* current document, seed words is 0,
				   then it increments into the relevant 
				   document */
     long word_pair;
     database *db;
{
  /* this side effects the document_score_array,
   * and downcases the word.
   * Returns 0 if successful or word not present, 
   * returns non-0 if an error.
   *
   */
  
  long not_full_flag = INDEX_BLOCK_FULL_FLAG; /* start out full so it will go on looking */
  long count, index_block_size;
  long internal_document_id,  number_of_valid_entries;
  double internal_weight;
  long index_file_block_number;
  long number_of_occurances;

  FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)];
  char *index_buffer;
#ifdef undef
  char *i = index_buffer;       /* What the hell should be in i ? (up) */
#endif
  FILE *stream = NULL;


#ifdef LITERAL
  long txt_pos, icnt, wcnt, pcnt;					/* 2/92 GS TLG */
  document_table_entry doc_entry;					/* 2/92 GS TLG */
  static FILE *txt_stream = NULL;					/* 2/92 GS TLG */
  char cmpr_word[MAX_PHRASE_LENGTH + 1];				/* 2/92 GS TLG */
  /*char phrase[MAX_PHRASE_LENGTH + 1];	*/				/* 2/92 GS TLG */
  char txt_filename[MAX_FILENAME_LEN + 1];				/* 2/92 GS TLG */
  char *temp_txt_filename = NULL;					/* francois */
  char prev_txt_filename[MAX_FILENAME_LEN + 1];				/* 2/92 GS TLG */
  char txt_type[MAX_TYPE_LEN + 1];					/* 2/92 GS TLG */
  long phraselen= 0, txt_pos_fix= 0;
  char *document_section = NULL;     /* tung , 10/93 */
  long document_section_len = 0;     /* tung , 10/93 */
  long phrase_readed = 0;            /* tung , 10/93 */
  long phrase_count = 0;             /* tung , 10/93 */
  boolean phrase_found = false;      /* tung , 10/93 */
#endif

#ifdef NESTED_BOOLEANS /* tung, 10/93 */
  long numeric_partial_valid_entries = 0;
#endif

#ifdef NEW_WEIGHT /* tung, 5/94 */
  double query_wgt;
#else
  double idf;
#endif
#ifdef FIELDS /* tung, 12/93 */
  long field_id = -1;
  boolean SearchField = false;
#endif

  /* do synonym conversion */
  
  /* in theory, one can replace a word with a boolean phrase */
  char *newword;

  newword = lookup_Synonym( word,db->syn_Table,db->syn_Table_Size );
  waislog(WLOG_HIGH,WLOG_INFO,"Word %s Syn %s",word,newword);
  strncpy(word,newword,MAX_WORD_LENGTH);

#ifdef FIELDS /* tung, 12/93 */
  if(db->number_of_fields > 0) {
    if(*field_name != '\0') {
      if(strcmp(field_name, FREE_TEXT_FIELD) == 0) { /* global database */
	field_name = "\0";
        SearchField = false;
	field_id = -1;
      } else { 
	SearchField = true;
	field_id = pick_up_field_id(field_name, db);
      }
    }
  }
#endif
  
/* tung, 10/93 */
#ifdef NESTED_BOOLEANS
  if(number_of_operands > 1) {
    make_search_result_array(number_of_operands);
    if((weight!=LITERAL_FLAG) && IsOperator(word)) {
      boolean_operations(word, search_result_array);
      return(0);
    }
    if(strlen(word) == 1) {
      search_result_array[operand_id].number_of_hits = 0;
      search_result_array[operand_id].operand_id = operand_id;
      if(!save_operand_id(operand_id, search_result_array, db->doc_table_allocated_entries))
	return(-1);
      ++operand_id;
      return(0);
    }
  }
#endif
/* tung, 10/93 */

  /* francois - call the stemmer */
#ifdef FIELDS /* tung, 1/94 */
  if(weight!=LITERAL_FLAG &&  weight!= FIELD_FLAG && weight!= NUMERIC_FLAG) {
#ifdef STEM_WORDS
    if(field_id > -1) {
      if(db->fields[field_id].stemming)
	stemmer(word);  
    }
    else {
      if(db->stemming)
	stemmer(word);
    }
#endif
  }
#else
#ifdef LITERAL
  if (weight!=LITERAL_FLAG) {
	stemmer(word); 
  }
#else
  stemmer(word); 
#endif
#endif

#ifdef LITERAL
  if (weight==LITERAL_FLAG) {
    /* goto after_booleans */
/* printf("search_word: literal word is [%s]\n", word); */
    }
  else 
#endif

#ifndef NESTED_BOOLEANS /* 10,93 */
#ifdef BOOLEANS
  if (strcmp(word,BOOLEAN_AND)==0) {  /* should be all lowercase cmp here */
    gLastAnd= true;
    return(0);
    }
  else if (strcmp(word,BOOLEAN_NOT)==0) {  
  /* ^^ this is bad if we intersperse "not"s in a query --
     docs found after not word may include notted word --
     need to go back to doing not words after others --
     but need now to check for literal string first
  */
    gLastNot= true;
    return(0);
  }
  if (weight == BOOLEAN_NOT_FLAG) gLastNot= true;
#else
     ;   /* if not LITERAL_FLAG */
#endif
#endif /* #ifndef NESTED_BOOLEANS */

  index_buffer = (char*)index_buffer_data;

#ifdef LITERAL
  if (weight==LITERAL_FLAG) {
  /* note: we found the first word of phrase once in map_over_words, but i'm too lazy 
  	to put another parameter in that cascade of function calls it takes
	to get here.
  */
    char  word1[MAX_WORD_LENGTH + 1];
    register int i, len;
    register boolean more;
    phraselen= MINIMUM( MAX_PHRASE_LENGTH, strlen(word));
    len = MINIMUM( MAX_WORD_LENGTH, phraselen);
    for (i=0, more=true; i < len && more; ) {
      word1[i] = word[i++];
#ifdef BIO
      more= (wordDelimiter(word[i]) == NOT_DELIMITER); 
#else
      more= (isalnum(word[i]));
#endif
     }
    word1[i]= '\0';
    txt_pos_fix= strlen(word1) + 1;
/* printf("search_word: literal word1 is [%s]\n", word1); */
#ifdef FIELDS /* tung, 1/94 */
    if((db->number_of_fields == 0) && !SearchField)
      index_file_block_number = 
        look_up_word_in_dictionary(word1, &number_of_occurances, db);
    else 
      index_file_block_number = 
        field_look_up_word_in_dictionary(field_name, word1, &number_of_occurances, db);
#else
    index_file_block_number = 
      look_up_word_in_dictionary(word1, &number_of_occurances, db);
#endif
  }
  else
#endif  /* LITERAL */

#ifdef PARTIALWORD
#ifdef FIELDS /* tung, 1/94 */
    index_file_block_number = 
      look_up_partialword_in_dictionary(field_name, 
                                        word, &number_of_occurances, db);
#else
    index_file_block_number = 
      look_up_partialword_in_dictionary(word, &number_of_occurances, db);
#endif
#else
  index_file_block_number = 
    look_up_word_in_dictionary(word, &number_of_occurances, db);
#endif

  current_best_hit = 0;  /* so that the best hits willstart from 0 */

  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);

  if(index_file_block_number >= 0){
#ifdef PARTIALWORD
   while(index_file_block_number > 0){  /* dgg, need 2nd loop here for multiple partwords */
#endif

#ifdef FIELDS /* tung, 1/94 */
     if(SearchField && *field_name != '\0') 
       stream = db->field_index_streams[pick_up_field_id(field_name, db)];
     else stream = db->index_stream;
#else
     stream = db->index_stream;
#endif

    while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) && 
	  (index_file_block_number != 0)){	
      /* read the index block */
      if (0 != fseek(stream, (long)index_file_block_number, 
		     SEEK_SET))	
	{ 
	  waislog(WLOG_HIGH, WLOG_ERROR, 
		  "fseek failed into the inverted file to position %ld",
		  (long)index_file_block_number); 
#ifdef BOOLEANS
   gLastNot= gLastAnd= false;
#endif  
	  return(-1);
	}
/*      
      read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE);

      ASSIGN(not_full_flag,
	     INDEX_BLOCK_FLAG_SIZE,
	     index_buffer,
	     INDEX_BLOCK_HEADER_SIZE,
	     0 );
      ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE,
	     index_buffer+INDEX_BLOCK_FLAG_SIZE,
	     INDEX_BLOCK_HEADER_SIZE,
	     INDEX_BLOCK_FLAG_SIZE);
      ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE,
	     index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE,
	     INDEX_BLOCK_HEADER_SIZE,
	     INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE);

  this is equivalent, but slower:
*/
      not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
      index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
      index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);

/*  Jim's debug code commented out
      printf("flag = %d, block_num = %d, block_size = %d\n",
	     not_full_flag, 
	     index_file_block_number,
	     index_block_size);
*/
      fflush(stdout);

      if(EOF == index_block_size) 
	{ 
	  waislog(WLOG_HIGH, WLOG_ERROR, 
		  "reading from the index file failed");
#ifdef BOOLEANS
   gLastNot= gLastAnd= false;
#endif  
	  return(-1);
	}
      
      if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
	/* not full */
	number_of_valid_entries = index_file_block_number;
      }
      else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
	/* full */
	number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
      }
      else{			/* bad news, file is corrupted. */
	waislog(WLOG_HIGH, WLOG_ERROR, 
		"Expected the flag in the inverted file to be valid.  it is %ld",
		not_full_flag);
#ifdef BOOLEANS
   gLastNot= gLastAnd= false;
#endif  
	return(-1);
      }
      /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */
      
      /* add the array to the document_score_array */
      number_of_valid_entries /= INDEX_ELEMENT_SIZE;

/* tung, 10/93 */
#ifdef NESTED_BOOLEANS   
      if((number_of_operands > 1) && (search_result_array != NULL)) {
#ifdef FIELDS /* tung, 1/94 */
        if(weight != NUMERIC_FLAG && weight != PARTIAL_FLAG) {
#else
        if(weight != PARTIAL_FLAG) {
#endif
          if(!make_doc_ids_array(operand_id, db->doc_table_allocated_entries))
	    return(-1);
          search_result_array[operand_id].number_of_hits = number_of_valid_entries;
        }
      }
#endif
/* tung, 10/93 */

#ifdef NEW_WEIGHT /* tung, 5/94 */      
	query_wgt = 1;
#else
	/* ses - idf is a fist approximation to the inverse document freq. */
	/* what it actually is  is the inverse occurance frequency which says
	 * that the significance of a word is inversly proportional to the number
	 * of times it occurs in the database */

	idf=1.0/number_of_occurances; 
#endif
      for(count=0;count <  number_of_valid_entries;count++) {
	int wgt;
	int did;
/*
	if(count%1024 == 0) {
	  read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE*
		MINIMUM(1024,number_of_valid_entries-count));
	  i=index_buffer;
	}
*/
	did = read_bytes(DOCUMENT_ID_SIZE, stream);
        (void)read_bytes(WORD_POSITION_SIZE, stream);
        txt_pos=read_bytes(CHARACTER_POSITION_SIZE, stream);
        wgt = read_bytes(WEIGHT_SIZE,stream);
#ifdef NEW_WEIGHT /* tung, 5/94 */
	internal_weight = read_weight_from_stream(NEW_WEIGHT_SIZE, stream);
#endif
/*

	ASSIGN(wgt,WEIGHT_SIZE,    
	       i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,
	       INDEX_ELEMENT_SIZE,
	       DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE);
	ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0);
*/
#ifdef LITERAL
	/* dgg -- is this proper update of read form to ASSIGN form ??*/
	/* txt_pos = read_bytes(CHARACTER_POSITION_SIZE, stream);*/		/* 2/92 GS TLG */
        if ((weight == LITERAL_FLAG) && (0 == doc_id))  {		 
/*
	ASSIGN(txt_pos,CHARACTER_POSITION_SIZE,i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE,
		INDEX_ELEMENT_SIZE,DOCUMENT_ID_SIZE+WORD_POSITION_SIZE);
*/
/* printf("search_word: txtpos=%d, wgt=%d, did=%d\n", txt_pos, wgt, did); */
        }
#endif

/* Commented out as suggested by Stan Isaacs at hp.com to come up with correct
 * weights when there are multiple documents in a file
 *
 *	if(wgt>5L)
 *		wgt-=5L;
 */
#ifndef NEW_WEIGHT /* tung, 5/94 */
	internal_weight = log((double)wgt);
	internal_weight+=10.0;
#endif
	internal_document_id = did;
	if((doc_start == 0) && (doc_end == 0)) /* tung, 5/94 */
	  doc_start = doc_end = did;           /* tung, 5/94 */
	doc_start = MINIMUM(doc_start, did);   /* tung, 5/94 */
	doc_end = MAXIMUM(doc_end, did);       /* tung, 5/94 */

/*
	printf("entry %ld, Doc_id: %ld, weight %lf \n",
		count, internal_document_id, internal_weight);
	fflush(stdout);
*/
	if(EOF == wgt) 
	  { 
	    waislog(WLOG_HIGH, WLOG_ERROR, 
		    "reading from the doc-id table failed");
#ifdef BOOLEANS
   gLastNot= gLastAnd= false;
#endif  
	    return(-1);
	  }

#ifdef LITERAL         
        if ((weight == LITERAL_FLAG) && (0 == doc_id)) {		/* 2/92 GS TLG */
          if (true == read_document_table_entry(&doc_entry,		/* 2/92 GS TLG */
                                             internal_document_id, db)) /* 2/92 GS TLG */
	   {								/* 2/92 GS TLG */
            read_filename_table_entry(doc_entry.filename_id,		/* 2/92 GS TLG */
                                      txt_filename, txt_type, NULL, db);  /* 2/92 GS TLG */
/* printf("search_word: document is [%s]\n", txt_filename); */
            if (NULL == txt_stream) {
              /* francois */
              if (probe_file(txt_filename)) {
                txt_stream = s_fopen(txt_filename, "rb");
              }
              else if (probe_file_possibly_compressed(txt_filename)) {
               temp_txt_filename = s_fzcat(txt_filename);
               if (temp_txt_filename) {
 	        txt_stream = s_fopen(temp_txt_filename, "rb");
 	       }
 	      }
 	      
              strcpy(prev_txt_filename, txt_filename);
             }
            else if (0 != strcmp(txt_filename, prev_txt_filename)) {
              s_fclose(txt_stream);
              /* francois */
              if ( temp_txt_filename != NULL ) {
                unlink(temp_txt_filename);
                s_free(temp_txt_filename);
              }
              if (probe_file(txt_filename)) {
                txt_stream = s_fopen(txt_filename, "rb");
              }
              else if (probe_file_possibly_compressed(txt_filename)) {
               temp_txt_filename = s_fzcat(txt_filename);
               if (temp_txt_filename) {
 	        txt_stream = s_fopen(temp_txt_filename, "rb");
 	        }
 	      }
              strcpy(prev_txt_filename, txt_filename);		/* 2/92 GS TLG */
              }

            txt_pos += doc_entry.start_character - txt_pos_fix;  /* dgg */
            document_section_len = doc_entry.end_character - txt_pos;   /* tung, 10/93 */
            s_fseek(txt_stream, txt_pos, SEEK_SET);			/* 2/92 GS TLG */
            document_section = 
              (char*) s_malloc((size_t)((document_section_len+1)*sizeof(char))); /* tung, 10/93 */
            fgets(document_section, document_section_len, txt_stream);   /* tung, 10/93 */
            phrase_readed = 0;                                           /* tung, 10/93 */
            phrase_readed += strlen(document_section);                   /* tung, 10/93 */
            document_section = string_downcase(document_section);        /* tung, 10/93 */
#if 0
            fread(phrase, 1L, phraselen, txt_stream);			/* 2/92 GS TLG */
            /* { phrase[phraselen]= '\0';
               printf("search_word: file phrase is [%s]\n", phrase); 
               } */
            if (0 != strncasecmp(word, phrase, phraselen))		/* 2/92 GS TLG */
              internal_weight = 0.0;	                                /* 2/92 GS TLG */
#endif				
            if (NULL == strstr(document_section, word)) {                    /* tung, 10/93 */
              while(phrase_readed < document_section_len) {                  /* tung, 10/93 */
                fgets(document_section, document_section_len, txt_stream);   /* tung, 10/93 */
                phrase_readed += strlen(document_section);                   /* tung, 10/93 */
                document_section = string_downcase(document_section);        /* tung, 10/93 */
                if(strstr(document_section, word) != NULL)  {                /* tung, 10/93 */
                  phrase_found = true;                                       /* tung, 10/93 */
                  break;                                                     /* tung, 10/93 */
                }                                                            /* tung, 10/93 */
              }                                                              /* tung, 10/93 */
              if(phrase_found == false)                                      /* tung, 10/93 */
                internal_weight = 0.0;					     /* tung, 10/93 */
              phrase_found = false;                                          /* tung, 10/93 */
            } 
            s_free(document_section);                                        /* tung, 10/93 */
          }
        }								
#endif

#ifndef NESTED_BOOLEANS /* 10,93 */
#ifdef BOOLEANS
	if (gLastNot) {    
	   document_score_array[internal_document_id] = 0;
/* 	printf("search_word: boolean 'not' scored\n"); */
        }
        else 
#endif
#endif /* #ifndef NESTED_BOOLEANS */
	{
	/* if(doc_id > 0) we are doing a relevant document */
/*
printf("wgt: %ld, internal weight: %lf, idf: %lf occurances: %ld\n",
	wgt,internal_weight, idf,number_of_occurances);
fflush(stdout);
*/
#ifndef NEW_WEIGHT /* tung, 5/94 */
        internal_weight*=idf; /* ses - for inverse doc. freq. */
#endif
#ifndef NESTED_BOOLEANS
#ifdef NEW_WEIGHT /* tung, 5/94 */
	document_score_array[internal_document_id] += 
	  (query_wgt *  internal_weight);
#else
	document_score_array[internal_document_id] += 
	  (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
#endif
#else
/* tung, 10/93 */
        if(number_of_operands == 1) {
#ifdef NEW_WEIGHT /* tung, 5/94 */
	  document_score_array[internal_document_id] += 
	    (query_wgt *  internal_weight);
#else
          document_score_array[internal_document_id] += 
	    (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
#endif
        }
        else {
          if((number_of_operands > 1) && (search_result_array != NULL)) {
            if(weight == LITERAL_FLAG) {
#ifdef NEW_WEIGHT /* tung, 5/94 */
	      ((search_result_array[operand_id]).doc_ids_array[phrase_count]).score +=
		(query_wgt *  internal_weight);
#else
              ((search_result_array[operand_id]).doc_ids_array[phrase_count]).score +=
                (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
#endif
              if(((search_result_array[operand_id]).doc_ids_array[phrase_count]).score > 0) {
                ((search_result_array[operand_id]).doc_ids_array[phrase_count]).doc_id = internal_document_id;
                phrase_count++;
                search_result_array[operand_id].number_of_hits = phrase_count;
              }
            }
#ifdef FIELDS /* tung, 1/94 */
            else if(weight == NUMERIC_FLAG || weight == PARTIAL_FLAG) {
#else
            else if(weight == PARTIAL_FLAG) {
#endif
              if(NumPart_score_array[internal_document_id] <= 0)
                ++numeric_partial_valid_entries;
#ifdef NEW_WEIGHT /* tung, 5/94 */
	      NumPart_score_array[internal_document_id] = 
		MAXIMUM(NumPart_score_array[internal_document_id], (query_wgt *  internal_weight));
#else
              NumPart_score_array[internal_document_id] += 
                (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
#endif
            }
            else {
              ((search_result_array[operand_id]).doc_ids_array[count]).doc_id = internal_document_id;
#ifdef NEW_WEIGHT /* tung, 5/94 */
	      ((search_result_array[operand_id]).doc_ids_array[count]).score +=
		(query_wgt *  internal_weight);
#else
              ((search_result_array[operand_id]).doc_ids_array[count]).score +=
                (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
#endif
            }
          }
        }
#endif
/* tung, 10/93 */
        
      }
/*
printf("Score array: %lf\n",document_score_array[internal_document_id]);
fflush(stdout);
*/

	/* i+=INDEX_ELEMENT_SIZE;  Purify (umr): uninitialized memory read: (up) */
      }
    }

#ifdef PARTIALWORD
#ifdef FIELDS /* tung, 1/94 */
  index_file_block_number = 
    look_up_partialword_in_dictionary(field_name, 
                                      NULL, &number_of_occurances, db);
#else
  index_file_block_number = 
    look_up_partialword_in_dictionary(NULL, &number_of_occurances, db);
#endif      
  }
#endif

#ifdef NESTED_BOOLEANS /* tung, 1/94 */
   if(number_of_operands > 1) {
     long index = 0;
#ifdef FIELDS /* tung, 1/94 */
     if(weight == NUMERIC_FLAG || weight == PARTIAL_FLAG) {
#else
     if(weight == PARTIAL_FLAG) {
#endif
       if(!make_doc_ids_array(operand_id, db->doc_table_allocated_entries))
	 return(-1);
       search_result_array[operand_id].number_of_hits =
         numeric_partial_valid_entries;
       /*for (count=0; count < db->doc_table_allocated_entries; count++) {*/
       for (count=doc_start; count <= doc_end ; count++) {
         if(NumPart_score_array[count] > 0) {
           ((search_result_array[operand_id]).doc_ids_array[index]).doc_id = count;
           ((search_result_array[operand_id]).doc_ids_array[index]).score
             = NumPart_score_array[count];
           NumPart_score_array[count] = 0.0;
           ++index;
         }
         if(index == numeric_partial_valid_entries)
           break;
       }
     }
   }
#endif

/* tung, 10/93 */
#ifdef NESTED_BOOLEANS
    if((number_of_operands > 1) && (search_result_array != NULL)) {
      if(!save_operand_id(operand_id, search_result_array,db->doc_table_allocated_entries))
	return(-1);
      search_result_array[operand_id].operand_id = operand_id;
      ++operand_id;
    }
#endif 
/* tung, 10/93 */

#ifndef NESTED_BOOLEANS /* tung, 10/94 */
#ifdef BOOLEANS
   for (count=0; count < db->doc_table_allocated_entries; count++) {   /* 12/91 GS TLG */
     if (!gLastAnd) { 							/* 12/91 GS TLG */
       prev_score_array[count] = document_score_array[count];		/* 12/91 GS TLG */
     }			 					/* 12/91 GS TLG */
     else {	   							/* 12/91 GS TLG */
       if ((document_score_array[count] == prev_score_array[count])	/* 12/91 GS TLG */
           || (prev_score_array[count] == 0)) {
         document_score_array[count] = 0;				/* 12/91 GS TLG */
         prev_score_array[count] = 0;					/* 12/91 GS TLG */
       }								/* 12/91 GS TLG */
       else {  
         prev_score_array[count] = document_score_array[count];	/* 12/91 GS TLG */
       }								/* 12/91 GS TLG */
     }									/* 12/91 GS TLG */
   }									/* 12/91 GS TLG */
/*  if (gLastAnd) printf("search_word: boolean `and' scored\n"); */
#endif
#endif 

#ifdef BOOLEANS
   gLastNot= gLastAnd= false;
#endif /* BOOLEANS */
    return(0); 
  }
  
  else if(0 == index_file_block_number){
    /* an error occurred on looking up the word */
#ifdef BOOLEANS
   gLastNot= gLastAnd= false;
#endif 
   return(-1);
 }
  
  else {				/* index_file_block_number is negative */
#ifdef NESTED_BOOLEANS /* tung, 10/93 */
    if((number_of_operands > 1) && (search_result_array != NULL)) {
      if(!save_operand_id(operand_id, search_result_array,db->doc_table_allocated_entries))
	return(-1);
      search_result_array[operand_id].operand_id = operand_id;
      search_result_array[operand_id].number_of_hits = 0;
      ++operand_id;
    }
#else
#ifdef BOOLEANS
    if (gLastAnd) 
      for (count=0; count < db->doc_table_allocated_entries; count++) {
        document_score_array[count] = 0;                       
        prev_score_array[count] = 0;   
      } 
    gLastNot= gLastAnd= false;
#endif
#endif 
    return(0);		/* word not present */
  }
}


/* Finish a search: now collect the best hits.
 *
 * db - the database that was searched; its doc_table_allocated_entries,
 *      syn_Table and syn_Table_Size fields are read here.
 *
 * Steps performed:
 *   - With BOOL defined and a currentQuery present, nothing is done.
 *   - With NESTED_BOOLEANS defined and more than one operand collected,
 *     retriev_result() is called with document_score_array and the
 *     per-operand search results are cleared afterwards.
 *   - document_score_array is (re)made if it is shorter than the number
 *     of allocated document table entries.
 *   - The best hits array is made for max_hit_retrieved entries and
 *     sorted, then the synonym table of db is freed.
 *
 * Always returns 0.
 */
long finished_search_word(db)
     database *db;
{ 
#ifdef BOOL
  /* The function is declared long, so return a value on this early exit
     as well; 0 is the same value returned at the end of the function. */
  if (currentQuery != NULL)
    return(0); /* do nothing for boolean */
#endif /* def BOOL */

/* tung, 10/93 */
#ifdef NESTED_BOOLEANS
  if((number_of_operands > 1) && (search_result_array != NULL)) {
    /* the value returned by retriev_result is not needed here */
    (void) retriev_result(db->doc_table_allocated_entries, 
                          document_score_array);
    clear_search_result_array(&number_of_operands);
  }
#endif
/* tung, 10/93 */
  
  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);

  make_best_hits_array(max_hit_retrieved);
  sort_best_hits(db);
  syn_Free( db->syn_Table,&db->syn_Table_Size );

  return(0);
}

