/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
*/

/* Copyright (c) CNIDR (see ../COPYRIGHT) */


#ifndef lint
static char *RCSid = "$Header: /archives/stelar/src/freeWAIS/freeWAIS-0.2/ir/RCS/waisindex.c,v 1.5 93/07/21 18:53:04 warnock Exp $";
#endif

/*
 * Building an index with a Unix shell interface.
 *
 * -brewster 6/90
 */

/* Change log:
 * added -stdio option from jik@athena.mit.edu
 * $Log:	waisindex.c,v $
 * Revision 1.5  93/07/21  18:53:04  warnock
 * Renamed from irbuild.c
 * Added STELAR-specific patches
 *
 * $Log: irbuild.c,v $
 * Revision 1.8  1993/10/12  11:18:25  pfeifer
 * Added stopword file for document style bibdb
 * 
 * Revision 1.1  93/07/19  16:30:22  warnock
 * Initial revision
 *
 * Revision 1.7  1993/09/22  16:09:13  pfeifer
 * What have i done ?
 *
 * Revision 1.4  93/07/01  19:40:31  warnock
 * Added prototype for function double
 *
 * Revision 1.6  1993/06/04  10:23:15  pfeifer
 * Patchlevel BIBDB
 * 
 * Revision 1.3  93/02/16  17:07:49  freewais
 *
 * Revision 1.5  1993/06/02  18:29:00  pfeifer
 * Added code for local formats
 *
 * Revision 1.4  1993/06/01  14:05:54  pfeifer
 * Added code for soundex/phonix indexing and retrieval
 *
 * Revision 1.3  1993/02/16  17:07:49  freewais
 * added AT&T patches for keyword list
 * 
 * Revision 1.2  1993/02/16  15:32:21  freewais
 * added AT&T patch to write first 50 dictionary entries to
 * src file
 *
 * Revision 1.1  1993/02/16  15:05:35  freewais
 * Initial revision
 *
 * Revision 1.47  92/05/10  14:48:17  jonathan
 * Updated for release.
 * 
 * Revision 1.46  92/05/08  10:03:17  jonathan
 * Adjusted memory parameters.  It's closer...
 * 
 * Revision 1.45  92/05/06  17:26:46  jonathan
 * Added switch for indexing contents, new user-specified type name, new type:
 * filename, which only puts the name of the file in the header.
 * 
 * Revision 1.44  92/04/25  21:14:35  brewster
 * added ziff
 * 
 * Revision 1.43  92/04/22  15:29:13  jonathan
 * Added jargon to usage message.
 * 
 * Revision 1.42  92/04/01  17:08:50  jonathan
 * Added FTP type.
 * 
 * Revision 1.41  92/03/25  18:49:39  jonathan
 * Added log_level and log_file arguments.
 * 
 * Revision 1.40  92/03/22  18:38:14  brewster
 * added objective C filter
 * 
 * Revision 1.39  92/03/20  11:02:44  jonathan
 * Added code to handle switches for word_pairs and word_position info.
 * 
 * Revision 1.38  92/03/17  07:34:32  jonathan
 * Fixed spacing in usage message.
 * 
 * Revision 1.37  92/03/10  10:42:51  morris
 * fixed small bug in command line argument handling.  doesn't die if there
 * are no args.
 * 
 * Revision 1.36  92/03/05  07:05:32  shen
 * add cm grow percent and textsize to command line and init search engine
 * 
 * Revision 1.35  92/03/04  16:34:09  jonathan
 * Set wais_pid from getpid().
 * 
 * Revision 1.34  92/02/20  09:49:37  jonathan
 * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl.
 * 
 * Revision 1.33  92/02/17  14:21:08  jonathan
 * Added switch to disable creation of catalog (-nocat).
 * 
 * Revision 1.32  92/02/17  12:41:55  jonathan
 * Added RCSid.
 * 
 * Revision 1.31  92/02/17  12:41:01  jonathan
 * Build catalog after completion of indexing.
 * 
 * Revision 1.30  92/02/12  13:22:53  jonathan
 * Added "$Log" so RCS will put the log message in the header
 * 
 */

/* to do:
 *   done: make incremental indexing not index things that are already indexed
 *   add extra arg -register that will send in a description of the server to
 *           the directory of servers.
 *   done: create a source struct in the .src file
 *   make it continuously index to keep itself up to date.
 *
 */

#include <string.h>
#include <sys/types.h>
#include <sys/param.h>
#include "irdirent.h"
#include "cutil.h"
#include "futil.h"
#include "irfiles.h"
#include "irtfiles.h"
#include "panic.h"
#include "ircfiles.h"
#include "version.h"
#include "irext.h"
#include "stoplist.h"	/* dgg */

/* Old-style (unprototyped) declaration of compare(); it is declared again
 * with `extern` further down.  NOTE(review): presumably defined in another
 * module -- no definition is visible in the reviewed part of this file.
 */
double compare();

/* Date string printed next to VERSION by the -v switch in main().
 * Both branches of the #ifdef currently define the same value.
 */
#ifdef BIO
#define INDEXER_DATE "2/16/93"
#else
#define INDEXER_DATE "2/16/93"
#endif
/* NOTE(review): not referenced in the reviewed part of this file --
 * TODO confirm it is still used further down.
 */
#define MAX_LINE_LENGTH 1000

/* Keyword / description tables and their counters, defined elsewhere.
 * NOTE(review): presumably these belong to the AT&T keyword-list patches
 * mentioned in the change log -- confirm against the defining module.
 */
extern char *keyword[50], *descript[1000];
extern short nKeys, nDesLines;
extern double compare();

/* for reporting errors, in WAIStation it is defined in CRetrievalApp.c */

/* Set to true by the undocumented -cm switch in main(). */
extern boolean indexingForBeta;

/*
 * usage -- print the command line synopsis of the indexer on stderr.
 *
 *   command: program name shown after "Usage:".
 *
 * Only writes to stderr; it neither returns a value nor exits, so the
 * caller decides what happens next.  Several groups of document types are
 * listed only when the matching feature macro (BIO, BIBDB, NeXT, SOUND,
 * AAS, STELAR) is defined at compile time.
 *
 * Every line is a printf format string, so a literal percent sign has to
 * be written as "%%".
 */
void usage(command)
char *command;
{
  fprintf(stderr,"Usage: %s [-d index_filename]\n", command);
  fprintf(stderr,"          [-a] /* adding to an existing index, otherwise it erases the index */\n");
  fprintf(stderr,"          [-r] /* recursively index subdirectories */\n");
  fprintf(stderr,"          [-mem mbytes] /* number of megabytes to run this in */\n");
  fprintf(stderr,"          [-register] /* registers the database with the directory of servers.\n");
  fprintf(stderr,"                         This should be done with care. */\n");
  fprintf(stderr,"          [-export] /* uses short dbname and port 210 */\n");
  fprintf(stderr,"          [-e [file]] /* set log output to file, or /dev/null if not specified */\n");
  fprintf(stderr,"          [-l log_level] /* set log level.  0 means log nothing,\n");
  fprintf(stderr,"                            10 [the default] means log everything */\n");
  fprintf(stderr,"          [-v] /* print the version of the software */\n");
  fprintf(stderr,"          [-filter process] /* use an external document parser */\n");
  fprintf(stderr,"          [-stdin] /* read file names from stdin */\n");
  /* The next entries used to end with an opening comment marker instead of
     a closing one; they now match the rest of the listing. */
  fprintf(stderr,"          [-pos | -nopos] /* include (don't include - default) word position information */\n");
  fprintf(stderr,"          [-nopairs | -pairs] /* don't include (or include - default) word pairs */\n");
  fprintf(stderr,"          [-nocat] /* inhibit creation of catalog */\n");
  fprintf(stderr,"          [-contents] /* Index the contents: this is good for types that\n");
  fprintf(stderr,"                         inhibit the indexing of the contents (like gif). */\n");
  fprintf(stderr,"          [-nocontents] /* Index only the filename, not the contents */\n");
#ifdef BIO
  fprintf(stderr,"          [-stop stoplist_filename] /* file of common words to ignore */\n");
  fprintf(stderr,"          [-delim delimiters] /* list of word delimiter symbols */\n");
#endif 
  fprintf(stderr,"          [-keywords \"<string>\"] /* Keywords to index for each document. */\n");
  fprintf(stderr,"          [-keyword_file <filename>] /* File of keywords to index. */\n");

  /* "%%" prints a single '%'; a bare "%]" is an invalid conversion. */
  fprintf(stderr,"          [-cmmem  mem%%] /* percent of CM memory (CM code only) */\n");
  fprintf(stderr,"          [-T  type] /* type becomes the \"TYPE\" of the document. */\n");
/* multitype extensions */
  fprintf(stderr,"          [-M  type,type] /* for multi-type documents. */\n");
  fprintf(stderr,"          [-t\t/* format of the file. if none then each file is a document */\n");
  fprintf(stderr,"             text /* simple text files, this is the default */\n");
  fprintf(stderr,"           | bibtex /* BibTeX / LaTeX format */\n");
  fprintf(stderr,"           | bio /* biology abstract format */\n");
  fprintf(stderr,"           | cmapp /* CM applications from Hypercard */\n");
  fprintf(stderr,"           | dash /* entries separated by a row of dashes */\n");
  fprintf(stderr,"           | dvi /* dvi format */\n");
  fprintf(stderr,"           | emacsinfo /* the GNU documentation system */\n");
  fprintf(stderr,"           | first_line /* first line of file is headline */\n");
  fprintf(stderr,"           | filename /* uses only the filename part of the pathname for the title */\n");
  fprintf(stderr,"           | ftp /* special type for FTP files.  First line of file is headline */\n");
  fprintf(stderr,"           | gif /* gif files, only indexes the filename */\n");
  fprintf(stderr,"           | html /* html */\n");
  fprintf(stderr,"           | irg /* internet resource guide */\n");
  fprintf(stderr,"           | jargon /* Jargon File 2.9.8 format*/\n");
  fprintf(stderr,"           | mail_digest /* standard internet mail digest format */\n");
  fprintf(stderr,"           | mail_or_rmail /* mail or rmail or both */\n");
  fprintf(stderr,"           | medline /* medline format */\n");
  fprintf(stderr,"           | mh_bboard /* MH bulletin board format */\n");
  fprintf(stderr,"           | netnews /* netnews format */\n");
  fprintf(stderr,"           | nhyp /* ?:? hyper text format, Polytechnic of Central London */\n");
  fprintf(stderr,"           | one_line /* each line is a document */\n");
  fprintf(stderr,"           | para /* paragraphs separated by blank lines */\n");
  fprintf(stderr,"           | pict /* pict files, only indexes the filename */\n");
  fprintf(stderr,"           | ps /* postscript format */\n");
  fprintf(stderr,"           | refer /* refer format */\n");
#ifdef BIBDB
  fprintf(stderr,"           | irlist           /* irlist mail or rmail or both */\n");  
  fprintf(stderr,"           | formfeed         /* entries separated by a formfeed */\n");
  fprintf(stderr,"           | bibdb            /* steve file entries separated by a formfeed */\n");
  fprintf(stderr,"           | bibinf           /* bibinf entries separated by an empty line */\n");
#endif
  fprintf(stderr,"           | rn /* netnews saved by the [rt]?rn newsreader */\n");
  fprintf(stderr,"           | server /* server structures for the dir of servers */\n");
#ifdef NeXT
  fprintf(stderr,"           | objc /* objective-C .h and .m files */\n");
#endif /* def NeXT */
  fprintf(stderr,"           | tiff /* tiff files, only indexes the filename */\n");
  fprintf(stderr,"           | URL what-to-trim what-to-add /* URL */\n");
  fprintf(stderr,"           | object /* a structured object*/\n");
  fprintf(stderr,"           | inriadoc /* INRIA library catalog */\n");
  /* No blank after the newline, so the next entry stays aligned. */
  fprintf(stderr,"           | paradoc /* INRIA library catalog para-mode */\n");
  fprintf(stderr,"           | fortran /* Fortran files,needs also -filter */\n");
  fprintf(stderr,"           | mime /* Like mail */\n");

#ifdef BIO
  fprintf(stderr,"           | genbank  /* GenBank flatfile format */\n");
  fprintf(stderr,"           | embl     /* EMBL flatfile format */\n");
  fprintf(stderr,"           | pir     /* PIR flatfile format */\n");
  fprintf(stderr,"           | prositedoc /* Prosite protein doc format */\n");
  fprintf(stderr,"           | prositedat /* Prosite protein dat format */\n");
  fprintf(stderr,"           | biojournal /* Bio journal TOC on bionet.journals */\n");
  fprintf(stderr,"           | redbook  /* Drosophila redbook text */\n");
  fprintf(stderr,"           | flybase  /* Drosophila Ashburner data files */\n");
  fprintf(stderr,"           | flystock  /* Drosophila stock lists */\n");
  fprintf(stderr,"           | din      /* Drosophila Info. Newsletter */\n");
#endif
#ifdef SOUND
  fprintf(stderr,"           | oneline_phonix   /* Phonebooks PHONIX */\n");
  fprintf(stderr,"           | oneline_soundex  /* Phonebooks SOUNDEX */\n");
#endif
  fprintf(stderr,"           | listserv_digest /* standard internet mail digest format */\n");
#ifdef AAS
  fprintf(stderr,"           | AAS_abstract /* AAS meeting abstracts using AAS LaTeX macros */\n");
#endif /* AAS */
#ifdef STELAR
  fprintf(stderr,"           | stelar /* stelar abstracts - third line is hl */\n");
#endif /* STELAR */
  fprintf(stderr,"          ] filename filename ...\n");
}

/* log_file_name is no longer defined here, although main() still assigns it
 * for the -e switch.  NOTE(review): presumably it is now declared by one of
 * the included headers -- confirm.
 */
/* char *log_file_name = NULL; */
/* Log stream; main() points it at stderr before parsing the arguments. */
FILE *logfile;

/* Set by main() from the -keywords and -keyword_file switches. */
extern char* keywords;           /* used in irtfiles.c */
extern char* keyword_filename;   /* used in irtfiles.c */


/* Toggled by -contents / -nocontents; main() also sets it to true when a
 * -t switch is seen.
 */
extern boolean index_contents;


/* This is the MAIN for building an index.
 */
void
main(argc, argv)
int argc;
char *argv[];
{
  database* db = NULL;
  long argc_copy = argc;
  char **argv_copy = argv;
  char *next_argument;
  char index_filename[1000];
  boolean adding_to_existing_index = false;
  boolean traverse_directory = false;
  boolean word_positions = false;
  boolean word_pairs = true;
  long memory_to_use = -1;
  long cm_mem_percent = 0;  /* default */
  long grow_percent = 0;  /* default */
  long text_size = 0;  /* default */
  boolean check_for_text_file = false;
  boolean register_database = false;
  boolean export_database = false;
  boolean read_files_from_stdin = false;
  boolean make_catalog = true;
  char data_filename[MAXPATHLEN];
  char *typename = NULL;  /* this is what the user said */
  long start_of_filenames;
  long hashtable_size = 1L<<16;
  long flush_after_n_words = 300000;
  char *command_name;
  char *filter_name = NULL;
  FILE *filter_process_in = NULL;
  FILE *filter_process_out = NULL;

  dataopsrec	dataops;    
  /*-------------   these go into dataops   
  boolean (*separator_function)();
  void (*header_function)();
  void (*finish_header_function)();
  long (*date_function)();
  char *type = NULL;     
  int minwordlen= 2;	 
  ---------------*/
  
    	/* dgg -- put all of these separate, datatype-specific functions & params into a record! */
  gDelimiters[0]= '\0'; /* <-- bombs ?? */
  dataops.separator_function= NULL;
  dataops.header_function= NULL;
  dataops.date_function= NULL;
  dataops.finish_header_function= NULL;
  dataops.type= "TEXT";
  dataops.indextype= NULL;
  dataops.multitype=NULL;
  dataops.addseparatorwords= false;
  dataops.extraheaderweight= true;
  dataops.repeat_weight= 1;
  dataops.minwordlen= 2;
  dataops.wordDelimiter= wordbreak_notalnum;
  dataops.delimiters= gDelimiters;
  wordDelimiter= wordbreak_notalnum;   
 
  /*------
  separator_function = NULL; 
  header_function = NULL;
  date_function = NULL;
  finish_header_function = NULL;
  type = "TEXT";  
  -------*/
  typename = "Text"; 
  

  next_argument = next_arg(&argc, &argv);
  command_name = next_argument;

  logfile = stderr;
  wais_pid = getpid();

  if(0 == argc) {
    usage(command_name);
    exit(0);
  }

#ifdef THINK_C
  strcpy(index_filename, "wais:System Folder:wais-index:index");
#else
  strcpy(index_filename, "index"); /* in the current directory */
#endif /* THINK_C */
  stop_list_file("\0");  	/* dgg */
  
  if(NULL == (next_argument = next_arg(&argc, &argv))){
    fprintf(stderr,"No arguments specified\n");
    exit(0);
  }
  while((next_argument != NULL) && '-' == next_argument[0]){
    /* then we have an argument to process */
    if((0 == strcmp("-i", next_argument)) || /* -i is for backcompatibility */
       (0 == strcmp("-d", next_argument))){
      if(NULL == (next_argument = next_arg(&argc, &argv))){
	fprintf(stderr,"Expected filename for the index\n");
	exit(0);
      	}
      strcpy(index_filename, next_argument);
      }
#ifdef BIO      
    else if (0 == strcmp("-stop", next_argument)){		/* dgg, stoplist file */
      if (NULL == (next_argument = next_arg(&argc, &argv))){
	fprintf(stderr,"Expected filename for the stoplist\n");
	exit(0);
      	}
      stop_list_file(next_argument);
      } 
    else if (0 == strcmp("-delim", next_argument)){		/* dgg, delimiters */
      if (NULL == (next_argument = next_arg(&argc, &argv))){
	fprintf(stderr,"Expected the delimiters argument\n");
	exit(0);
      	}
      strcpy(gDelimiters, next_argument);
      dataops.wordDelimiter = wordbreak_user;
      wordDelimiter = wordbreak_user;
      printf("Delimiters used in index: %s\n\n",gDelimiters);
      } 
#endif

    else if(0 == strcmp("-a", next_argument)){
      adding_to_existing_index = true;
    }
    else if(0 == strcmp("-r", next_argument)){
      traverse_directory = true;
    }
    else if(0 == strcmp("-register", next_argument)){
      register_database = true;
    }
    else if(0 == strcmp("-export", next_argument)){
      export_database = true;
    }
    else if(0 == strcmp("-v", next_argument)){
      fprintf(stderr,"%s: %s %s\n", command_name, VERSION, INDEXER_DATE);
    }
    else if (0 == strcmp("-stdin", next_argument)) {
      read_files_from_stdin = true;
    }
    else if (0 == strcmp("-nopos", next_argument)) {
      word_positions = false;
    }
    else if (0 == strcmp("-pos", next_argument)) {
      word_positions = true;
    }
    else if (0 == strcmp("-nopairs", next_argument)) {
      word_pairs = false;
    }
    else if (0 == strcmp("-pairs", next_argument)) {
      word_pairs = true;
    }
    else if (0 == strcmp("-nocat", next_argument)) {
      make_catalog = false;
    }
    else if(0 == strcmp("-mem", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
	panic("Expected a number for the amount of memory to use");
      memory_to_use = atol(next_argument);
      if(memory_to_use < 1)
	panic("The -mem argument should not be less than 1");
      if(memory_to_use > 200)
	fprintf(stderr,"Warning: The -mem parameter was %ld Mbytes.  That is a large number of mega bytes in current machines\n", memory_to_use);
    }
    else if(0 == strcmp("-cmmem", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
	panic("Expected a number (1-100) for percentage of memory to use");
      cm_mem_percent = atol(next_argument);
      if(cm_mem_percent < 1)
	panic("The -cmmem argument should not be less than 1 and less than 100");
      if(cm_mem_percent > 100)
	panic("Warning: The -cmmem parameter was %ld%%. It should be between 1-100.", cm_mem_percent);
    }
       else if(0 == strcmp("-filter", next_argument)){
       if(NULL == (next_argument = next_arg(&argc, &argv)))
       panic("Expected the name of a program to use to find keywords");
       filter_name=next_argument;
    }
    else if(0 == strcmp("-grow", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number (1-100) for database growing percentage");
      grow_percent = atol(next_argument);
      if(grow_percent < 1)
        panic("The -grow argument should not be less than 1");
    }
    else if(0 == strcmp("-textsize", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
        panic("Expected a number for text size in megabytes");
      text_size = atol(next_argument);
      if(text_size < 1)
        panic("The -textsize argument should not be less than 1");
    }
    else if (0 == strcmp("-e", next_argument)) {
      char *peek_argument = peek_arg(&argc, &argv);
      log_file_name = "/dev/null"; /* default to /dev/null */
      if ((peek_argument != NULL) &&
	  ('-' != peek_argument[0])) {
	log_file_name = next_arg(&argc, &argv);
      }				/* end if (explicit log file) */
    }				/* end if (-e) */
    else if (0 == strcmp("-l", next_argument)) {
      wais_log_level = atol(next_arg(&argc, &argv));
    }				/* end if (-l) */
    else if(0 == strcmp("-cm", next_argument)){
      /* this is an undocumented argument to help use this to
	 front end the CM application */
      indexingForBeta = true;
    }
    else if(0 == strcmp("-T", next_argument)){
      /* This is a specification for a "Special" type.  The next argument
	 is the type name.  This will not index the body of the file. */
      if(NULL == (next_argument = next_arg(&argc, &argv)))
	panic("Expected a file type");
      typename = next_argument;
      dataops.type = next_argument;
      fprintf(stderr,"waisindex: setting type to %s\n", next_argument);
      dataops.finish_header_function = filename_finish_header_function;
    }

/* multitype extensions */
/* 
   This is a specification for a multi-type document, the types should
   be entered as a comma delimited list.  Note that this only defines
   all the types available in the database, you also need to specify a
   -t option so that the indexer knows how to parse the files. 
   One of the limitations here is that each document must
   be a file with the extension of the file being the document type, so
   the document #### has a text file ####.TEXT and a jfif file
   ####.JFIF, not real nice but needed.

   Note that this contains both the primary and secondary document
   types, whereas dataops.type contains the primary type.
*/

    else if(0 == strcmp("-M", next_argument)){
      if(NULL == (next_argument = next_arg(&argc, &argv)))
	panic("Expected a multitype list");
      dataops.multitype = next_argument;
   }

     else if(0 == strcmp("-keywords", next_argument)){
       if(NULL == (next_argument = next_arg(&argc, &argv)))
 	panic("Expected -keywords argument string");
       keywords = next_argument;
     }
     else if(0 == strcmp("-keyword_file", next_argument)){
       if(NULL == (next_argument = next_arg(&argc, &argv)))
 	panic("Expected -keyword_file filename");
       keyword_filename = next_argument;
    }

    else if(0 == strcmp("-contents", next_argument)){
      index_contents = true;
    }
    else if(0 == strcmp("-nocontents", next_argument)){
      index_contents = false;
    }
    else if(0 == strcmp("-t", next_argument)){
      /* then we have a specialized file */
      index_contents = true;
      if(NULL == (next_argument = next_arg(&argc, &argv)))
	panic("Expected a file type");
      if(0 == strcmp("groliers", next_argument)){
	typename = next_argument;
	dataops.type ="TEXT";
	dataops.separator_function = groliers_separator_function;
	dataops.header_function = groliers_header_function;
	dataops.finish_header_function = groliers_finish_header_function;
      }

#ifdef BIO 
       else if(0 == strcmp("genbank", next_argument)){/* dgg */
 	typename = next_argument;
 	dataops.type ="TEXT";
 	dataops.separator_function = genbank_separator_function;
 	dataops.header_function = genbank_header_function;
 	dataops.finish_header_function = genbank_finish_header_function;
 	dataops.date_function = genbank_date_function;
	dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
	dataops.minwordlen= 2;
       }
       else if(0 == strcmp("embl", next_argument)){/* dgg */
 	typename = next_argument;
 	dataops.type ="TEXT";
 	dataops.separator_function = embl_separator_function;
 	dataops.header_function = embl_header_function;
 	dataops.finish_header_function = embl_finish_header_function;
 	dataops.date_function = embl_date_function;
 	dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
      }
       else if(0 == strcmp("pir", next_argument)){/* dgg */
 	typename = next_argument;
 	dataops.type = "TEXT";
 	dataops.separator_function = pir_separator_function;
 	dataops.header_function = pir_header_function;
 	dataops.finish_header_function = pir_finish_header_function;
 	dataops.date_function = pir_date_function;
	dataops.repeat_weight= 0;
 	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
      }
        else if(0 == strcmp("prositedoc", next_argument)){ /* dgg */
         typename = next_argument;
         dataops.type = "TEXT";
         dataops.separator_function = prositedoc_separator_function;
         dataops.header_function = prositedoc_header_function;
         dataops.finish_header_function = prositedoc_finish_header_function;
  	 dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
       }
       else if(0 == strcmp("prositedat", next_argument)){ /* dgg */
         typename = next_argument;
         dataops.type = "TEXT";
         dataops.separator_function = prositedat_separator_function;
         dataops.header_function = prositedat_header_function;
         dataops.finish_header_function = prositedat_finish_header_function;
  	 dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
       }
       else if(0 == strcmp("biojournal", next_argument)){ /* dgg */
         typename = next_argument;
         dataops.type = "TEXT";
         dataops.separator_function = biojournal_separator_function;
         dataops.header_function = biojournal_header_function;
         dataops.finish_header_function = biojournal_finish_header_function;
  	 dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
       }
 
       else if(0 == strcmp("redbook", next_argument)){ /* dgg */
 	typename = next_argument;
 	dataops.type = "TEXT";
 	dataops.separator_function = redbook_separator_function;
 	dataops.header_function = redbook_header_function;
 	dataops.finish_header_function = redbook_finish_header_function;
  	dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
        dataops.wordDelimiter= wordbreak_user; /* redbook_delimiter;  */
        wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */
	dataops.minwordlen= 1;
	if (gDelimiters[0] == '\0') strcpy( gDelimiters, "/{}()[]%-:#.~*\";,|");
       }
       else if(0 == strcmp("flybase", next_argument)){ /* dgg */
 	typename = next_argument;
 	dataops.type = "TEXT";
 	dataops.separator_function = flybase_separator_function;
 	dataops.header_function = flybase_header_function;
 	dataops.finish_header_function = flybase_finish_header_function;
  	 dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
        dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter;  */
        wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */
	dataops.minwordlen= 1;
	if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|");
	
	/* flybase symbols
	  valid data ()$+-?;.\'
	  possible data and delimiter |;[]-?.~
	  delimiters 
	  solution to confusion: set possible delimiters as delimiters, and
	     permit literal searches with "..." or '...' enclosed strings.
	*/
	
       }
      else if(0 == strcmp("flystock", next_argument)){	/* dgg */
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = bio_separator_function;
	dataops.header_function = bio_header_function;
	dataops.finish_header_function = bio_finish_header_function;
  	 dataops.repeat_weight= 0;
	dataops.addseparatorwords= true;
	dataops.extraheaderweight= false;
        dataops.wordDelimiter= wordbreak_user; /* flybase_delimiter;  */
        wordDelimiter= wordbreak_user; /* wordbreak_notgraph; */
	dataops.minwordlen= 1;
	if (gDelimiters[0] == '\0') strcpy( gDelimiters, "-/{}:.~*\";,|");

	/* flystock symbols
	  valid data []()/-;?+.{}
	  possible data and delimiter  =;.
	    ;. in text field is del, in data field is data
	  delimiters *";,
	  more delimiters (from matthewk)  - / {} :
	  
	  solution to confusion: set possible delimiters as delimiters, and
	     permit literal searches with "..." or '...' enclosed strings.
	  ! want some way to provide field names (report "stylesheet") with
	    searched/fetched records for flybase, flystock, other data files
	  ! want "keyword [field]" limited searches for some of this to make sense !
	*/
      }

      else if(0 == strcmp("din", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = din_separator_function;
	dataops.header_function = din_header_function;
	dataops.finish_header_function = din_finish_header_function;
      }

#endif 

#ifdef NeXT
      else if(0 == strcmp("objc", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = wobjc_separator_function;
	dataops.header_function = wobjc_header_function;
	dataops.finish_header_function = wobjc_finish_header_function;
      }
#endif /* def NeXT */
      else if(0 == strcmp("listserv_digest", next_argument)){
 	typename = next_argument;
 	dataops.type = "TEXT";
 	dataops.separator_function = listserv_digest_separator_function;
  	dataops.header_function = listserv_header_function;
  	dataops.date_function = listserv_date_function;
  	dataops.finish_header_function = listserv_finish_header_function;
      }
#ifdef AAS
      else if(0 == strcmp("AAS_abstract", next_argument)){
 	typename = next_argument;
	dataops.separator_function = aasab_separator_function;
 	dataops.header_function = aasab_header_function;
 	dataops.finish_header_function = aasab_finish_header_function;
      }
#endif /* AAS */
#ifdef STELAR
      else if(0==strcmp("stelar",next_argument)){
        dataops.type="TEXT";
        typename=next_argument;
        dataops.separator_function=stelar_separator_function;
        dataops.header_function=stelar_header_function;
        dataops.finish_header_function=stelar_finish_header_function;
      }
#endif /* STELAR */
      else if(0 == strcmp("mail", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = mail_separator_function;
	dataops.header_function = mail_header_function;
	dataops.date_function = mail_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("mail_or_rmail", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = mail_or_rmail_separator;
	dataops.header_function = mail_header_function;
	dataops.date_function = mail_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("mail_digest", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = mail_digest_separator_function;
 	dataops.header_function = mail_header_function;
 	dataops.date_function = mail_date_function;
 	dataops.finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("mh_bboard", next_argument)){
 	typename = next_argument;
 	dataops.type = "TEXT";
 	dataops.separator_function = mh_bboard_separator_function;
	dataops.header_function = mail_header_function;
	dataops.date_function = mail_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("rmail", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = rmail_separator_function;
	dataops.header_function = mail_header_function;
	dataops.date_function = mail_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("netnews", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = NULL;
	dataops.header_function = mail_header_function;
	dataops.date_function = mail_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
      else if(0 == strcmp("rn", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = rn_separator_function;
	dataops.header_function = mail_header_function;
	dataops.date_function = mail_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
#ifdef BIBDB
      else if(0 == strcmp("irlist", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = irlist_separator_function;
	dataops.header_function = irlist_header_function;
	dataops.date_function = irlist_date_function;
	dataops.finish_header_function = mail_finish_header_function;
      }
      /* formfeed-separated items , Intro to Algorithms buglist, etc */
      else if(0 == strcmp("formfeed", next_argument)){
	typename = next_argument;
	if (!dataops.type || (strlen(dataops.type)==0)) {
          if (dataops.type)
            fprintf(stderr, "irbuild: overwriting type %s\n", dataops.type);
          dataops.type = "TEXT";
        } else {
          fprintf(stderr, "irbuild: using type %s\n", dataops.type);
        }
	dataops.separator_function = formfeed_separator_function;
	dataops.header_function = dash_header_function;
	dataops.finish_header_function = dash_finish_header_function;
      }
      /* formfeed-separated items , steve files */
      else if(0 == strcmp("bibdb", next_argument)){
	typename = next_argument;
	if (!dataops.type || (strlen(dataops.type)==0)) {
          if (dataops.type)
            fprintf(stderr, "irbuild: overwriting type %s\n", dataops.type);
          dataops.type = "TEXT";
        } else {
          fprintf(stderr, "irbuild: using type %s\n", dataops.type);
          stop_list_file("bibdb.stop");
        }
	dataops.separator_function = bibdb_separator_function;
	dataops.header_function = bibdb_header_function;
	dataops.date_function = bibdb_date_function;
	dataops.finish_header_function = bibdb_finish_header_function;
      }
      /* formfeed-separated items, bibinbf */
      else if(0 == strcmp("bibinf", next_argument)){
	typename = next_argument;
        dataops.type = "TEXT";
        dataops.separator_function = bibinf_separator_function;
        dataops.header_function = bibinf_header_function;
#ifdef SIMPLE_BIBINF
        dataops.date_function = bibinf_date_function;
#endif
        dataops.finish_header_function = bibinf_finish_header_function;
      }
#endif
      else if(0 == strcmp("emacsinfo", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = emacs_info_separator_function;
	dataops.header_function = emacs_info_header_function;
	dataops.finish_header_function = emacs_info_finish_header_function;
      }
      else if(0 == strcmp("catalog", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = catalog_separator_function;
	dataops.header_function = catalog_header_function;
	dataops.finish_header_function = catalog_finish_header_function;
      }
      else if(0 == strcmp("bio", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = bio_separator_function;
	dataops.header_function = bio_header_function;
	dataops.finish_header_function = bio_finish_header_function;
      }
      else if(0 == strcmp("cmapp", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";	
	dataops.separator_function = cmapp_separator_function;
	dataops.header_function = cmapp_header_function;
	dataops.finish_header_function = cmapp_finish_header_function;
      }
      else if(0 == strcmp("ftp", next_argument)){
	dataops.type = "TEXT-FTP";
	typename = next_argument;
	dataops.separator_function = first_line_separator_function;
	dataops.header_function = first_line_header_function;
	dataops.finish_header_function = first_line_finish_header_function;
      }
      else if(0 == strcmp("jargon", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = jargon_separator_function;
	dataops.header_function = jargon_header_function;
	dataops.finish_header_function = jargon_finish_header_function;
      }
      else if(0 == strcmp("server", next_argument)){
	typename = next_argument;
	dataops.type = "WSRC";
	dataops.finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("text", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	check_for_text_file = true;
      }
      else if(0 == strcmp("filename", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.finish_header_function = filename_finish_header_function;
      }
      /* html format */
      else if(0 == strcmp("html", next_argument)){
        dataops.type = "HTML";
        typename = next_argument;
        dataops.separator_function = html_separator_function;
        dataops.header_function = html_header_function;
        dataops.finish_header_function = html_finish_header_function;
      }
      else if(0 == strcmp("irg", next_argument)){
	typename = next_argument;
	dataops.type = "TEXT";
	dataops.separator_function = irg_separator_function;
	dataops.header_function = irg_header_function;
	dataops.finish_header_function = irg_finish_header_function;
      }
      /* dash-separated items , Intro to Algorithms buglist, etc */
      else if(0 == strcmp("dash", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = dash_separator_function;
	dataops.header_function = dash_header_function;
	dataops.finish_header_function = dash_finish_header_function;
      }
      /* one_line-separated items */
      else if(0 == strcmp("one_line", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = one_line_separator_function;
	dataops.header_function = one_line_header_function;
	dataops.finish_header_function = one_line_finish_header_function;
      }
      /* blank line-separated items (paragraphs) */
      else if(0 == strcmp("para", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = para_separator_function;
	dataops.header_function = para_header_function;
	dataops.finish_header_function = para_finish_header_function;
      }
      /* seeker items */
      else if(0 == strcmp("seeker", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = seeker_separator_function;
	dataops.header_function = seeker_header_function;
	dataops.finish_header_function = seeker_finish_header_function;
      }
      /* medline format */
      else if(0 == strcmp("medline", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = medline_separator_function;
	dataops.header_function = medline_header_function;
	dataops.finish_header_function = medline_finish_header_function;
      }
      /* refer format */
      else if(0 == strcmp("refer", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = refer_separator_function;
	dataops.header_function = refer_header_function;
	dataops.finish_header_function = refer_finish_header_function;
      }
      /* first_line format */
      else if(0 == strcmp("first_line", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = first_line_separator_function;
	dataops.header_function = first_line_header_function;
	dataops.finish_header_function = first_line_finish_header_function;
      }
      /* rlin items */
      else if(0 == strcmp("rlin", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = rlin_separator_function;
	dataops.header_function = rlin_header_function;
	dataops.finish_header_function = rlin_finish_header_function;
      }
      else if(0 == strcmp("dvi", next_argument)){
	typename = next_argument;
	dataops.type = "DVI";
	dataops.finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("ps", next_argument)){
	typename = next_argument;
	dataops.type = "PS";
	dataops.finish_header_function = filename_finish_header_function;
      }
      else if(0 == strcmp("pict", next_argument)){
	typename = next_argument;
	dataops.type = "PICT";	
	dataops.finish_header_function = filename_finish_header_function;
	index_contents = false;
      }
      else if(0 == strcmp("gif", next_argument)){
	typename = next_argument;
	dataops.type = "GIF";	
	dataops.finish_header_function = filename_finish_header_function;
	index_contents = false;
      }
      else if(0 == strcmp("tiff", next_argument)){
	typename = next_argument;
	dataops.type = "TIFF";	
	dataops.finish_header_function = filename_finish_header_function;
	index_contents = false;
      }
      else if(0== strcmp("object", next_argument)) {
        dataops.type = "OBJECT";
        typename = next_argument;
      }
      else if(0 == strcmp("inriadoc", next_argument)){
        typename = next_argument;
        dataops.type = "TEXT";
        dataops.separator_function = NULL;
        dataops.header_function = inriadoc_header_function;
        dataops.date_function = NULL;
        dataops.finish_header_function = inriadoc_finish_header_function;
      }
      else if(0 == strcmp("fortran", next_argument)){
        typename = next_argument;
        dataops.type = "FORTRAN";
      }
      else if(0 == strcmp("paradoc", next_argument)){
        typename = next_argument;
        dataops.type = "TEXT";
        dataops.separator_function = para_separator_function;
        dataops.header_function = inriadoc_header_function;
        dataops.date_function = NULL;
        dataops.finish_header_function = inriadoc_finish_header_function;
      }
      else if(0 == strcmp("mime", next_argument)){
        typename = next_argument;
        dataops.type = "MIME";
        dataops.separator_function = mail_separator_function;
        dataops.header_function = mail_header_function;
        dataops.date_function = mail_date_function;
        dataops.finish_header_function = mail_finish_header_function;
      }
      /* BibTeX items */
      else if(0 == strcmp("bibtex", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = bibtex_separator_function;
	dataops.header_function = bibtex_header_function;
	dataops.finish_header_function = bibtex_finish_header_function;
      }
      /* ?:? seperated hypertext items */
      else if(0 == strcmp("nhyp", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = nhyp_separator_function;
	dataops.header_function = nhyp_header_function;
	dataops.finish_header_function = nhyp_finish_header_function;
      }
      /* Uniform Resource Locators  - from Nat Torkington */
      else if(0 == strcmp("URL", next_argument)) {
        dataops.type = "URL";
        typename = next_argument;
        URL_trim = s_strdup(next_arg(&argc, &argv));
        URL_prefix = s_strdup(next_arg(&argc, &argv));
      }
      else if(0 == strcmp("ziff", next_argument)){
	dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = ziff_separator_function;
	dataops.header_function = ziff_header_function;
	dataops.finish_header_function = ziff_finish_header_function;
      }
#ifdef SOUND
      else if(0 == strcmp("oneline_soundex", next_argument)){
	dataops.indextype = "SOUNDEX";
        dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = one_line_separator_function;
	dataops.header_function = one_line_header_function;
	dataops.finish_header_function = one_line_finish_header_function;
      }
      else if(0 == strcmp("oneline_phonix", next_argument)){
	dataops.indextype = "PHONIX";
        dataops.type = "TEXT";
	typename = next_argument;
	dataops.separator_function = one_line_separator_function;
	dataops.header_function = one_line_header_function;
	dataops.finish_header_function = one_line_finish_header_function;
      }
#endif
      else{
	panic("Don't recognize the '%s' type", next_argument);
      }
    }
    else{
      panic("Don't recognize the '%s' option", next_argument);
    }
    next_argument = next_arg(&argc, &argv);
    if (! (read_files_from_stdin || next_argument)) {
      fprintf(stderr,"No files specified\n");
      exit(0);
    }
  }
  start_of_filenames = argc_copy - argc - 1;

  /* check index */
  if(0 == strlen(pathname_name(index_filename))){
    waislog(WLOG_HIGH, WLOG_ERROR,
	    "The pathname specified for the destination of the index files ('%s') should have a leaf filename without an extention rather than just a directory.",
	    index_filename);
    exit(0);     
  }
    
  waislog(WLOG_MEDIUM, WLOG_INDEX, "Starting to build database %s",
	  index_filename);

  if(0 != init_search_engine(index_filename, false, false, cm_mem_percent,
          text_size, grow_percent))
    panic("unable to initialize search engine");

  if(true == adding_to_existing_index){
    db = openDatabase(index_filename, false, false);
    if (db == NULL){ /* does not exist, create one */
      db = openDatabase(index_filename, true, false);
      if (db == NULL)
	panic("unable to open the database");
    }
  }
  else{
    db = openDatabase(index_filename, true, false);
    if (db == NULL)
      panic("unable to open the database");
  }
  
#ifdef BIO
    write_delimiters(gDelimiters, db);
#endif

  { /* set up the memory hashtable */

    if(memory_to_use < 0){ /* default */
      /* do nothing */
    }
    else if(memory_to_use <= 2){
      hashtable_size = 1L<<16;
      flush_after_n_words = 50000;
    }

    else if(memory_to_use <= 3){
      hashtable_size = 1L<<16;
      flush_after_n_words =850000;
    }
    else if(memory_to_use <= 4){
      hashtable_size = 1L<<16;
      flush_after_n_words = 110000;
    }
    else if(memory_to_use <= 5){
      hashtable_size = 1L<<16;
      flush_after_n_words = 150000;
    }

    else if(memory_to_use <= 10){
      /* shown to take about 6MB on a sun4, when it is dict limited */
      hashtable_size = 1L<<16;
      flush_after_n_words = 300000;
    }
    else if(memory_to_use <= 20){
      hashtable_size = 1L<<17;
      flush_after_n_words = 600000;
    }
    else{ /* over 20 Mbytes */
      hashtable_size = 1L<<18;
      flush_after_n_words = 1000000;
    }
/* Set up the filter process, if needed */
     /* We do this before initing the hash table to stop the fork copying a
        load of rubbish*/
   if(filter_name) {
     int to_handles[2];
     int from_handles[2];
     int pid;
     extern int errno;
     if (pipe(to_handles) <0) {
       panic("can't open to pipe");
     }
     if (pipe(from_handles) <0) {
       panic("can't open from pipe");
     }

     if((pid = fork()) ==0) {

       /* child */

       close(0);
       close(1);
       close(2);
       dup(to_handles[0]);
       dup(from_handles[1]);         /* Set up standard input/output/error */
       dup(from_handles[1]);
       close(to_handles[0]);
       close(to_handles[1]);
       close(from_handles[0]);
       close(from_handles[1]);

       if(execl(filter_name,filter_name,NULL) == -1) {
       exit(errno);
       }
       /*NOTREACHED*/
 }    
     /* parent */

    if (pid <0) {
       panic("Couldn't fork");
     }
     close(to_handles[0]);
     close(from_handles[1]);

     filter_process_in=fdopen(to_handles[1],"w");
     filter_process_out=fdopen(from_handles[0],"r");
     waislog(WLOG_LOW, WLOG_INDEX, "Filter %s started (%d)",filter_name,pid);
   }

    init_add_word(db, hashtable_size, flush_after_n_words);
  }

  if (read_files_from_stdin) {
    if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) {
      int len = strlen(next_argument);
      if (next_argument[len-1] == '\n') {
	next_argument[len-1] = '\0';
      }
    }
  }

  while(NULL != next_argument){ /* the first filename is in next_argument already */
    if(directoryp(next_argument)){
       if(traverse_directory){
	 index_directory(next_argument, &dataops, db,
			 check_for_text_file,
			 adding_to_existing_index,
			 word_positions, word_pairs,
			 filter_process_in,filter_process_out);
/*	 index_directory(next_argument,
			 separator_function,
			 header_function,
			 date_function,
			 finish_header_function,
			 type, db,
			 check_for_text_file,
			 adding_to_existing_index, 
			 word_positions, word_pairs, minwordlen); */
       }
     }
    else{			/* not a directory */
      waislog(WLOG_MEDIUM, WLOG_INDEX, 
	      "Indexing file: %s", next_argument);
      index_text_file(next_argument, &dataops, db, 
		      check_for_text_file, adding_to_existing_index,
		      word_positions, word_pairs,
		      filter_process_in,filter_process_out);
/*      index_text_file(next_argument,
		      separator_function,
		      header_function,
		      date_function,
		      finish_header_function,
		      type, db, 
		      check_for_text_file, adding_to_existing_index,
		      word_positions, word_pairs, minwordlen); */
    }
    if (read_files_from_stdin) {
      if (0 != (next_argument = fgets(data_filename, MAXPATHLEN, stdin))) {
	int len = strlen(next_argument);
	if (next_argument[len-1] == '\n') {
	  next_argument[len-1] = '\0';
	}
      }
    }
    else {
      next_argument = next_arg(&argc, &argv);
    }
  }
  finished_add_word(db);
  retreive_keywords(db);
  {
    char filename[MAX_FILENAME_LEN + 1];
    if(!probe_file(source_filename(filename, db))){
      char database_name[MAX_FILENAME_LEN];
      write_src_structure(source_filename(filename, db),
			  export_database?pathname_name(index_filename):
    			      truename(index_filename, database_name),
			  typename,
			  &argv_copy[start_of_filenames],
			  argc_copy - start_of_filenames,
			  export_database,
			  210L);
    }else{
       char *oldkeys[50];
       short oldKeys;
       if ((oldKeys = read_src_structure(source_filename(filename, db),
oldkeys))) {

       if (compare(keyword, nKeys, oldkeys, oldKeys) > 0.1) {
         char database_name[MAX_FILENAME_LEN];
         waislog(WLOG_MEDIUM,WLOG_INDEX, 
	"Keyword comparison indicates significant change.");
         waislog(WLOG_MEDIUM,WLOG_INDEX, "Rewriting source description.");
         waislog(WLOG_MEDIUM,WLOG_INDEX, 
"New source description should be exported.");
         write_src_structure(source_filename(filename, db),
			export_database?pathname_name(index_filename):
                             truename(index_filename, database_name),
                             typename,
                             &argv_copy[start_of_filenames],
                             argc_copy - start_of_filenames,
			export_database, 210L);
       }
       } else {
       char database_name[MAX_FILENAME_LEN];
       waislog(WLOG_MEDIUM,WLOG_INDEX, "No keyword list found.");
       waislog(WLOG_MEDIUM,WLOG_INDEX, "Rewriting source description.");
       waislog(WLOG_MEDIUM,WLOG_INDEX, 
"New source description should be export ed.");
       write_src_structure(source_filename(filename, db),
export_database?pathname_name(index_filename):
                           truename(index_filename, database_name),
                           typename,
                           &argv_copy[start_of_filenames],
  argc_copy - start_of_filenames,
                       export_database,
                           210L);
       }
}
    /* write out a description of the server if appropriate */
    if(register_database){
      register_src_structure(source_filename(filename, db));
    }
  }
  if(make_catalog) build_catalog(db);
  closeDatabase(db);
  /* wait for filter process to die, if there was one*/

   if(filter_process_in) {
     fprintf(filter_process_in,"Q\n");
     fflush(filter_process_in);
     fclose(filter_process_out);
     fclose(filter_process_in);
     waislog(WLOG_LOW, WLOG_INDEX, "Filter %s Exited (%ld)",filter_name,wait(0L));
   }

  waislog(WLOG_MEDIUM, WLOG_INDEX, "Finished build");
  exit(0);
}

 /*
  * read_src_structure -- scan an existing source-description file.
  *
  * filename: path of the source-description file to read.
  * output:   caller-supplied array of 50 slots; receives one malloc'ed,
  *           NUL-terminated copy per keyword found.  The caller owns them.
  *
  * Keywords are the lines between a line starting with "   :keyword-list ("
  * and the following line starting with "                  )".  Leading
  * blanks and the trailing newline are removed from each keyword.  Keywords
  * beyond the 50th are ignored so the caller's array is never overrun.
  *
  * As a side effect, every line from the one starting with "   :description"
  * up to and including the first line whose first character is a double
  * quote is copied verbatim into descript[], advancing nDesLines (both are
  * defined outside this function).
  * NOTE(review): the capacity of descript[] is not visible here, so no bound
  * is enforced on nDesLines -- confirm against its declaration.
  *
  * Returns the number of keywords stored in output; 0 if there were none or
  * the file could not be opened.
  */
 int read_src_structure(filename, output)
 char *filename;
 char *output[50];
 {
   /* Markers that open/close the sections of interest. */
   static char keylist_open[]  = "   :keyword-list (";
   static char keylist_close[] = "                  )";
   static char desc_open[]     = "   :description";
   enum { MAX_OLD_KEYWORDS = 50 };	/* slots available in output[] */

   FILE *source_stream = s_fopen(filename, "r");
   char line[MAX_LINE_LENGTH], *ptr, *copy;
   int keyflag = 0;		/* non-zero while inside the keyword list */
   int desflag = 0;		/* non-zero while inside the description */
   int index = 0;		/* number of keywords stored so far */
   size_t linelen;

   if (source_stream == NULL)
     return(0);

   while (fgets(line, MAX_LINE_LENGTH, source_stream)) {
     if (keyflag) {
       /* strncmp stops at the NUL of a short line, so a line shorter than
          the marker can never match it. */
       if (!strncmp(keylist_close, line, sizeof(keylist_close) - 1))
 	keyflag = 0;
       else {
 	/* drop the trailing newline, but only if there really is one:
 	   the last line of a file or an over-long line has none */
 	linelen = strlen(line);
 	if (linelen > 0 && line[linelen-1] == '\n')
 	  line[linelen-1] = '\0';
 	ptr = line;		/* parse keyword */
 	while (*ptr == ' ')
 	  ptr++;
 	if (index < MAX_OLD_KEYWORDS) {
 	  copy = malloc(strlen(ptr)+1);
 	  if (copy != NULL) {	/* skip the keyword if out of memory */
 	    strcpy(copy, ptr);
 	    output[index] = copy;
 	    index++;
 	  }
 	}
       }
     }
     /* The opening marker is tested after the keyword handling so that the
        marker line itself is not taken as a keyword. */
     if (!strncmp(keylist_open, line, sizeof(keylist_open) - 1))
       keyflag = 1;
     if (!strncmp(desc_open, line, sizeof(desc_open) - 1))
       desflag = 1;
     if (desflag) {
       copy = malloc(strlen(line)+1);
       if (copy != NULL) {	/* skip the line if out of memory */
 	strcpy(copy, line);
 	descript[nDesLines] = copy;
 	nDesLines++;
       }
       if (*line == '\"')	/* closing quote line ends the description */
 	desflag = 0;
     }
   }
   fclose(source_stream);
   return(index);
 }
 
 /*
  * compare -- measure how far keyword list a differs from keyword list b.
  *
  * a, alen: first list and its number of entries.
  * b, blen: second list and its number of entries.
  *
  * Counts the entries of a that have no exact (strcmp) match anywhere in b,
  * prints "<count> out of <alen>" on stdout, and returns that count as a
  * fraction of alen: 0.0 when every entry of a occurs in b, 1.0 when none
  * does.  An empty a yields 0.0 instead of dividing by zero.
  */
 double compare(a, alen, b, blen)
 char *a[50], *b[50];
 short alen, blen;
 {
   int changes = 0;
   int i, j;

   for (i = 0; i < alen; i++) {
     for (j = 0; j < blen; j++)
       if (!strcmp(a[i], b[j]))
         break;
     if (j == blen)		/* scanned all of b without a match */
       changes++;
   }
   printf("%d out of %d\n", changes, alen); /* info stuff */
   if (alen <= 0)		/* nothing to compare: avoid 0/0 (NaN) */
     return(0.0);
   return((double) changes/alen);
 }
