/* WIDE AREA INFORMATION SERVER SOFTWARE:
   No guarantees or restrictions.  See the readme file for the full standard
   disclaimer.

   Brewster@think.com
 *
 * $Log: irfiles.h,v $
 * Revision 1.2  1993/06/01  14:05:54  pfeifer
 * Added code for soundex/phonix indexing and retrieval
 *
 * Revision 1.1  1993/02/16  15:05:35  freewais
 * Initial revision
 *
 * Revision 1.19  92/04/16  20:04:44  morris
 * small fix to dictionary_block_word_occurances: the length read was
 * NEXT_INDEX_BLOCK_SIZE, now it's NUMBER_OF_OCCURANCES_SIZE.
 * 
 * Revision 1.18  92/03/19  09:34:08  morris
 * fixed the dictionary header to accurately indicate the number of blocks
 * 
 * Revision 1.17  92/02/17  12:38:00  jonathan
 * Added defines for catalog.
 * 
 */

/* Copyright (c) CNIDR (see ../COPYRIGHT) */


/* include file for irfiles.c */

#ifndef IRFILES_H
#define IRFILES_H

#include "cdialect.h"
#include "cutil.h"
#include "hash.h"
#include "ustubs.h" /* for time_t */
#include "synonym.h"

/*
 * Filename suffixes of the component files of a database.  Each macro
 * expands to a string literal that includes the leading dot.
 */
#define dictionary_ext      ".dct"
#define filename_table_ext  ".fn"
#define headline_table_ext  ".hl"
#define document_table_ext  ".doc"
#define index_ext           ".inv"
#define source_ext          ".src"
#define catalog_ext         ".cat"
#define synonym_ext         ".syn"
#ifdef BIO
#define delimiters_ext      ".dlm"  /* dgg */
#endif

/*
 * Dictionary constants shared by irhash, irverify and irfiles.
 *
 * Retired definitions, kept here for reference only:
 *   DICTIONARY_ENTRY_COUNT_SIZE 3        -- moved to the inverted file
 *   DICTIONARY_ENTRY_INDEX_BLOCK_SIZE 4  -- not used, and too long a symbol
 *   DICTIONARY_ELEMENT_SIZE 6            -- was 9
 */
#define DICTIONARY_HEADER_SIZE           4
#define DICTIONARY_BLOCK_SIZE            1000L  /* counted in entries, not bytes */
#define DICTIONARY_ENTRY_HASH_CODE_SIZE  2
#define DICTIONARY_SIZE                  524288L

/* The word whose dictionary entry holds the total number of words in
   the whole dictionary. */
#define DICTIONARY_TOTAL_SIZE_WORD       "{}"

/*
 * Constants describing the inverted index file.
 * NOTE(review): the *_SIZE values look like field widths in bytes
 * (NEXT_INDEX_BLOCK_SIZE and NUMBER_OF_OCCURANCES_SIZE are passed as
 * byte counts to read_bytes_from_memory further down) -- confirm for
 * the others against irfiles.c.
 */
#define INDEX_HEADER_SIZE 4

#ifdef BIO

/* !! Bug in W8B5 -- Increasing this INDEX_BLOCK_SIZE_SIZE above 2 now fails !! */
/* it worked in W8B3 ... both 3 and 4 fail now */
#define INDEX_BLOCK_SIZE_SIZE 2  /* was 2, genbank wants 3, dgg */

#else
#define INDEX_BLOCK_SIZE_SIZE 2
#endif

#define NEXT_INDEX_BLOCK_SIZE 4
#define INDEX_BLOCK_FLAG_SIZE 1

/* dgg: this used to be a literal 7, which is only right while the
   component sizes stay unchanged; derive it from them instead. */
#define INDEX_BLOCK_HEADER_SIZE (INDEX_BLOCK_SIZE_SIZE+NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_FLAG_SIZE)


#define NUMBER_OF_OCCURANCES_SIZE 4

/* Values stored in the flag field of an index block. */
#define INDEX_BLOCK_NOT_FULL_FLAG 101
#define INDEX_BLOCK_FULL_FLAG 69
#define INDEX_BLOCK_DICTIONARY_FLAG 123

/* Widths of the fields of one index element. */
#define DOCUMENT_ID_SIZE 4
#define WORD_POSITION_SIZE 0
#define CHARACTER_POSITION_SIZE 3
#define WEIGHT_SIZE 1

/* Derived from the four field widths above (4+0+3+1 == 8 today) rather
   than hard-coded, so it cannot silently go stale if one of them is
   changed -- the same problem INDEX_BLOCK_HEADER_SIZE used to have.
   NOTE(review): assumes an element is exactly these four fields;
   confirm against the reader/writer in irfiles.c. */
#define INDEX_ELEMENT_SIZE \
  (DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE+WEIGHT_SIZE)

#define WORD_ID_SIZE 4 /* for posting arrays */

/*
 * Optional query features, each compiled in only when its switch
 * (BOOLEANS, PARTIALWORD, LITERAL, SOUND) is defined.
 * The negative flag values are parenthesized so that they expand safely
 * inside any surrounding expression.
 */
#ifdef BOOLEANS			/* dgg */
#define BOOLEAN_AND	"and"	/* may prefer "&", but need symbol fix */
#define BOOLEAN_NOT	"not"	/* may prefer "!", but need symbol fix */
#define BOOLEAN_NOT_FLAG (-91)	/* stick in weight param as flag for search_word */
#endif

#ifdef PARTIALWORD		/* dgg */
#define PARTWORD_WILDCARD  '*'
#endif

#ifdef LITERAL			/* dgg */
#define LITERAL_KEY1	'"'
#define LITERAL_KEY2	0x27    /* single quote ' dgg */
#define LITERAL_FLAG	(-92)	/* stick in weight param as flag for search_word */
#define MAX_PHRASE_LENGTH  200
#endif

#ifdef SOUND
#define SOUNDEX "soundex"
#define PHONIX  "phonix"
#endif

/*
 * State kept for one database; openDatabase() hands out a pointer to
 * one of these and most functions in this header take it as `db`.
 */
typedef struct database {
	char *database_file;

	/* Streams for the component files.  NOTE(review): presumably one
	   per *_ext suffix defined at the top of this header -- confirm
	   in irfiles.c. */
	FILE *dictionary_stream;
	FILE *filename_table_stream;
	FILE *headline_table_stream;
	FILE *document_table_stream;
	FILE *index_stream;
#ifdef BIO
	FILE *delimiters_stream;
#endif
	long doc_table_allocated_entries;
	hashtable *the_word_memory_hashtable;

	/* For building.  Checked on every add_word; set at the start of
	   building and on every flush. */
	long number_of_words_in_hashtable;

	/* Set at the start of building; compared with
	   number_of_words_in_hashtable. */
	long flush_after_n_words;

	/* For building.  Number of different words.  Set from the headers
	   of .inv files as they are merged, and used to set the header
	   when a .inv file is first created (not by merging). */
	long number_of_words;

	/* For building. */
	long index_file_number;

	/* Total number of word occurances.  Set during indexing, saved in
	   the dictionary under the 'ALL' entry. */
	long total_word_count;

	void *ext_database;

	/* Synonym index lookup table and its number of entries. */
	t_Synonym *syn_Table;
	int syn_Table_Size;
} database;

/*
 * One record of the document table, as passed to
 * read_document_table_entry() and write_document_table_entry().
 */
typedef struct document_table_entry {
	long filename_id;
	long headline_id;
	long source_id;         /* for signature system */
	long start_character;
	long end_character;
	long document_length;   /* in characters */
	long number_of_lines;   /* in lines */
	time_t date;            /* 0 if unknown */
} document_table_entry;

#ifdef __cplusplus
/* declare these as C style functions */
extern "C"
	{
#endif /* def __cplusplus */

/*
 * Database handle lifecycle.
 * NOTE(review): the exact meaning of `initialize` and `for_search`, and
 * how closeDatabase differs from disposeDatabase, is defined in
 * irfiles.c and is not visible from this header.
 */
database *openDatabase _AP((char *name,
			    boolean initialize,
			    boolean for_search));
void closeDatabase _AP((database *the_db));
void disposeDatabase _AP((database *the_db));

void initialize_index_files _AP((database *db));

/*
 * Filename table, headline table and (BIO builds only) delimiters.
 * NOTE(review): buffer ownership and the meaning of the long return
 * values are defined in irfiles.c -- confirm there before relying on
 * them.
 */
char *read_filename_table_entry _AP((long position,
				     char *filename,
				     char *type,
				     time_t *file_write_date,
				     database *db));
long write_filename_table_entry _AP((char *filename,
				     char *type,
				     database *db));

boolean filename_in_database _AP((char *filename,
				  char *type,
				  time_t *write_file_date,
				  database *db));
boolean filename_in_filename_file _AP((char *filename,
				       char *type,
				       time_t *file_write_date,
				       char *filename_file));

char *read_headline_table_entry _AP((long position, database *db));
long write_headline_table_entry _AP((char *headline, database *db));

#ifdef BIO
char *read_delimiters _AP((database *db));
long write_delimiters _AP((char *delimiters, database *db));
#endif

/*
 * Document table access.  Entries are exchanged through a
 * caller-supplied document_table_entry.
 * NOTE(review): whether `number` / `doc` are zero- or one-based
 * document ids is not visible here -- confirm in irfiles.c.
 */
boolean read_document_table_entry _AP((document_table_entry *doc_entry,
				       long number,
				       database *db));

long write_document_table_entry _AP((document_table_entry *doc_table_entry,
				     database *db));

boolean writeUserValToDocIDTable _AP((unsigned long userVal,
				      long doc,
				      database *db));

long next_document_id _AP((database *db));


/*
 * Dictionary file: creation, population and lookup.
 * look_up_partialword_in_dictionary is only declared when PARTIALWORD
 * is defined.
 * NOTE(review): the meaning of the long return values is defined in
 * irfiles.c and is not visible from this header.
 */
void close_dictionary_file _AP((database *db));

long add_word_to_dictionary _AP((char *word,
				 long index_file_block_number,
				 long number_of_occurances,
				 database *db));
#ifdef PARTIALWORD
long look_up_partialword_in_dictionary _AP((char *word,
					    long *word_id,
					    database *db));
#endif
long look_up_word_in_dictionary _AP((char *word,
				     long *word_id,
				     database *db));

long init_dict_file_for_writing _AP((database *db));
void init_dict_file_detailed _AP((FILE *dictionary_stream,
				  long number_of_blocks));
void record_num_blocks_in_dict _AP((FILE *dictionary_stream,
				    long number_of_words));

long finished_add_word_to_dictionary _AP((database *db));

/*
 * Source-description (.src) and catalog support.
 * NOTE(review): the semantics of the arguments are defined in irfiles.c
 * and are not visible from this header.
 *
 * The third parameter of write_src_structure is spelled `type_name`
 * rather than `typename`: `typename` is a reserved word in C++, and this
 * header is meant to be includable from C++ (see the extern "C" wrapper
 * around these declarations).  Parameter names in a prototype do not
 * affect callers or the definition.
 */
boolean register_src_structure _AP((char *filename));
boolean write_src_structure _AP((char *filename, 
				 char *database_name, 
				 char *type_name,
				 char **filenames, 
				 long number_of_filename,
				 boolean export_database,
				 long tcp_port));

boolean build_catalog _AP((database* db));

/*
 * Low-level block helpers that work directly on a stream.
 * NOTE(review): the units of `how_large`, `position` and `length`
 * (bytes vs. entries) are not visible here -- confirm in irfiles.c.
 */
long allocate_index_block _AP((long how_large, FILE *stream));

unsigned char *read_dictionary_block _AP((unsigned char *block,
					  long position,
					  long length,
					  FILE *stream));

void print_dictionary _AP((database *db));

/*
 * Size of one entry in a dictionary block: the sum of MAX_WORD_LENGTH,
 * 1 (for the '\0'), NEXT_INDEX_BLOCK_SIZE and NUMBER_OF_OCCURANCES_SIZE.
 */
#define DICTIONARY_ENTRY_SIZE 29

/*
 * Accessors for entry `i` of an in-memory dictionary block.  They are
 * real functions when DICT_FUNC is defined and macros otherwise.  An
 * entry starts with the word, followed after MAX_WORD_LENGTH + 1 bytes
 * by the index block position and then by the occurance count.
 */
#ifdef DICT_FUNC

char *dictionary_block_word _AP((long i, unsigned char *block));
long dictionary_block_position _AP((long i, unsigned char *block));
long dictionary_block_word_occurances _AP((long i, unsigned char *block));

#else /* macros */

/* Address of the first byte of entry `i` within `block`. */
#define DICT_BLOCK_ENTRY_ADDR(i,block) \
  ((block) + (i) * DICTIONARY_ENTRY_SIZE)

#define dictionary_block_word(i,block) \
  ((char *)DICT_BLOCK_ENTRY_ADDR(i,block))

#define dictionary_block_position(i,block) \
  read_bytes_from_memory(NEXT_INDEX_BLOCK_SIZE, \
			 DICT_BLOCK_ENTRY_ADDR(i,block) + \
			 MAX_WORD_LENGTH + 1)

#define dictionary_block_word_occurances(i,block) \
  read_bytes_from_memory(NUMBER_OF_OCCURANCES_SIZE, \
			 DICT_BLOCK_ENTRY_ADDR(i,block) + \
			 MAX_WORD_LENGTH + 1 + NEXT_INDEX_BLOCK_SIZE)

#endif /* DICT_FUNC */

void print_dictionary_block _AP((unsigned char *block, long size));

/*
 * Database filename functions, one per component file.  Each takes a
 * `destination` buffer and the database, and returns a char pointer.
 * NOTE(review): presumably the name is written into `destination` and
 * that same pointer is returned; the required buffer size is not
 * visible here -- confirm in irfiles.c.
 */
char *dictionary_filename _AP((char *destination, database *db));
char *filename_table_filename _AP((char *destination, database *db));
char *headline_table_filename _AP((char *destination, database *db));
char *document_table_filename _AP((char *destination, database *db));
char *index_filename _AP((char *destination, database *db));
char *index_filename_with_version _AP((long version,
				       char *destination,
				       database *db));
char *source_filename _AP((char *destination, database *db));
#ifdef BIO
char *delimiters_filename _AP((char *destination, database *db));
#endif

#ifdef __cplusplus
	}
#endif /* def __cplusplus */

#endif /* IRFILES_H */
