/* index sequential files, producing .idx and .sel files */
/* Bruce Tanner - Cerritos College */

/* Version History:
1.0  05/10/93 Original attempt
2.0  06/20/93 Create indexed files directly, add keyword count field
2.1  07/08/93 Change the file name for NOISE_WORDS
2.2  07/08/93 Move the range end (end_pos) to before the terminator
2.2jlw 07/14/93 - JLW added length spec to dash, added additional topic
                  divider keywords
2.3  07/19/93 Set multi-buffer, multi-block counts, read-ahead, write-behind
              and deferred write; noticeably increased performance
2.4  07/26/93 Removed index name, added CLI$ interface, added /TOPIC
2.4jlw 07/27/93 fixed version retention, which was broken
2.5  07/27/93 Selector strings forced to lowercase; use a good copy
2.6  07/29/93 revamp /TOPIC syntax to include text, size, exclude
2.7  07/30/93 make SIZE=n pad as well as truncate field width
2.8  08/03/93 take wildcard input file names, add /OUTPUT, /VERSION
2.9  08/05/93 JLW changed filename sizes from 80 to 256 characters
2.10 08/05/93 add check for max number of topics, reformat code
2.11 08/24/93 JLW added specific statuses for exit errors
2.12 10/01/93 add /NODEFAULT_TOPIC to omit topics that have no topic keyword
2.13 11/03/93 add /LINK to generate .link file instead of .idx/.sel
2.14 11/15/93 add /NOISE=file to specify the noise words file
2.15 11/17/93 add /TOPIC=(position), /FIELD=(position, size), /PUNCTUATION
2.16 11/18/93 fix illegal strcpy for AXP, add /MAX_TOPICS
2.17 11/21/93 make load_noise friendlier, add /NOPUNCTUATION support
2.18 11/27/93 add /MINIMUM_WORD, /COUNT_WORDS
2.19 11/30/93 fix broken /TOPIC
*/
 
#include <ssdef.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <rms.h>
#include <descrip.h>
#include <climsgdef.h>

#define CHUNK 50     /* increment to expand table of words */
#define DESC_SIZE 70 /* maximum size of a topic description */
#define SELECTOR_SIZE 100 /* maximum size of a selector (minus description) */
#define TOPIC_SIZE 20 /* maximum number of topics to list */
#define PUNCT_CHARS  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

/* One /TOPIC rule together with the text it most recently captured */
typedef struct {
    char *text;     /* keyword text to look for at column pos; NULL for a purely positional rule */
    char *found;    /* heap copy of the topic text taken from the latest matching line, or NULL */
    int  pos;       /* 1-based column of the field; 0 marks an unused slot */
    int  size;      /* width the captured text is truncated/padded to; 0 means the rest of the line */
    int  exclude;   /* nonzero: the keyword text is cut out of the line before it is captured */
} topic_str;

/* words[] holds the words collected for the topic being built:
   words_index entries are in use, words_size entries are allocated */
int words_index, words_size;
char **words;
/* noise[] holds the words read by load_noise(), laid out like words[] */
int noise_index, noise_size;
char **noise;
/* widths of the word, topic number and count fields of an index record;
   count_words is nonzero when /COUNT_WORDS is present */
int max_word, max_topic, max_count, count_words;
/* key buffer and record buffer for the .idx file, allocated in main() */
char *keyword, *idx_record;
/* record buffer for the .sel file */
char sel_record[DESC_SIZE + SELECTOR_SIZE + 100];

/* Prototypes for the routines defined in this file.  is_spaces() and power()
   are included so that every call is checked against a declaration instead
   of relying on an implicit one. */
void build_words(char *, char *, int);
void expand_table(char ***, int *);
void write_words(FILE *, FILE *, struct RAB *, struct RAB*, int *, int *, int *,
                 char *, topic_str *);
void load_noise(char *);
int is_noise(char *, int, int);
int is_punct(char, char *);
int is_spaces(char *, int, int);
int power(int, int);
struct dsc$descriptor_s *descr(char *string);
void parse_topic(char *, topic_str *);
void *my_realloc(void *, int);
/* passed to cli$dcl_parse as the command table */
void index_commands();
int lib$get_foreign(), lib$get_input();


/*
 * Program entry point.
 *
 * 1. Fetches the foreign command line, prefixes it with the verb "index "
 *    and parses it with cli$dcl_parse against the index_commands table.
 * 2. Reads the qualifiers: how the text is divided into topics, field
 *    widths, the punctuation set and the /TOPIC rules.
 * 3. Creates the indexed .idx and .sel files, or the .link file when /LINK
 *    is present.
 * 4. For every file matching the input specification, reads it line by
 *    line, collects words with build_words() and calls write_words() each
 *    time a topic ends.
 *
 * Exit statuses used here: 1 usage, 3 unreadable input file, 7 command
 * parse error, 9 /MAX_TOPICS too large, 11 lib$find_file failure,
 * 13 link file could not be created.
 */
main(int argc, char *argv[])
{

    FILE  *src, *lnk = NULL;
    char  *cp, *cp2, *ptr, desc[DESC_SIZE + 1], src_line[256];
    static char cli_input[256], punctuation[128], temp_punct[128];
    static char value[20], file_arg[256], file_spec[256], out_name[256];
    char  orig_line[256], spaces[DESC_SIZE + 1];
    int   start_pos, end_pos = 0, db_index, status, index, context = 0;
    enum  {para, dash, hex, equal, line, whole, field} type = para;
    int   dash_len = 0, ind, minimum_word = 3;  /* usage text documents 3 as the default */
    int   hex_value = 0, field_pos = 1, field_size = 0;
    short leng = 0;
    char  *dashes = NULL;
    struct FAB idxfab, selfab;
    struct RAB idxrab, selrab;
    struct XABKEY idxxab, selxab;
    topic_str topics[TOPIC_SIZE];
    $DESCRIPTOR(input_dsc, cli_input);
    $DESCRIPTOR(file_dsc, file_arg);
    $DESCRIPTOR(file_spec_dsc, file_spec);
    $DESCRIPTOR(out_dsc, out_name);
    $DESCRIPTOR(punct_dsc, temp_punct);
    $DESCRIPTOR(value_dsc, value);

    status = lib$get_foreign(&input_dsc, 0, &leng, 0);

    /* the verb is inserted in front of the text, so six bytes plus the
       byte at cli_input[leng] must still fit in cli_input */
    if (leng > (int) sizeof(cli_input) - 7) {
        printf("Command line too long\n");
        exit(7);
    }

    /* shift the text right by six bytes and put "index " in front of it */
    for (ind = leng; ind >= 0; ind--)
        cli_input[ind+6] = cli_input[ind];
    strncpy(cli_input, "index ", 6);
    input_dsc.dsc$w_length = leng+6;

    status = cli$dcl_parse(&input_dsc, index_commands, lib$get_input);

    if (status != CLI$_NORMAL)  /* error in parse, exit */
        exit(7);

    if ((cli$present(descr("file")) & 1) == 0) {
        printf("Usage: index document\n");
        printf(" /TOPIC=(text,position,size,exclude)  specify topic names\n");
        printf(" /WORD_LENGTH=n      maximum size of index key (default 20)\n");
        printf(" /MAX_TOPICS=n       maximum size of topic ID field (default 6)\n");
        printf(" /PARAGRAPH          text separated by blank lines\n");
        printf(" /FF                 text separated by form feeds\n");
        printf(" /DASH=n             text separated n dashes (default 3)\n");
        printf(" /EQUAL=n            text separated n equals (default 80)\n");
        printf(" /CHARACTER=n        text separated by control character 'n'\n");
        printf(" /LINE               each line is separate text entry\n");
        printf(" /WHOLE              whole file is one text entry\n");
        printf(" /FIELD=(position,size)  specify topic break on field\n");
        printf(" /OUTPUT=file        override name of index/selection files\n");
        printf(" /[NO]VERSION        keep [discard] document version in selection\n");
        printf(" /[NO]DEFAULT_TOPIC  keep [discard] topics [not] matched by /TOPIC\n");
        printf(" /LINK               generate .link file instead of .idx,.sel files\n");
        printf(" /NOISE=file         specify a file of words to omit in the index\n");
        printf(" /PUNCTUATION=\"...\"  specify the characters that separate words\n");
        printf(" /MINIMUM_WORD=n     define minimum word to index (default 3)\n");
        printf(" /COUNT_WORDS        record count of duplicate words in index\n");
        exit(1);
    }

    /* topic divider qualifiers; when several are given the last one tested wins */
    if (cli$present(descr("paragraph")) & 1)
        type = para;
    if (cli$present(descr("FF")) & 1) {
        type = hex;                      /* /FF same as /character=12 */
        hex_value = '\f';
    }
    if (cli$present(descr("character")) & 1) {
        status = cli$get_value(descr("character"), &value_dsc, &leng);
        type = hex;
        value[leng] = '\0';
        hex_value = atoi(value);
    }
    if (cli$present(descr("whole")) & 1)
        type = whole;
    if (cli$present(descr("line")) & 1)
        type = line;
    if (cli$present(descr("dash")) & 1) {
        status = cli$get_value(descr("dash"), &value_dsc, &leng);
        type = dash;
        value[leng] = '\0';
        dash_len = atoi(value);
    }
    if (cli$present(descr("equal")) & 1) {
        status = cli$get_value(descr("equal"), &value_dsc, &leng);
        type = equal;
        value[leng] = '\0';
        dash_len = atoi(value);
    }
    if ((status = cli$present(descr("word_length"))) & 1) {
        status = cli$get_value(descr("word_length"), &value_dsc, &leng);
        value[leng] = '\0';
        max_word = atoi(value);
    }
    if (cli$present(descr("field")) & 1) {
        type = field;
        status = cli$get_value(descr("field.position"), &value_dsc, &leng);
        value[leng] = '\0';
        if (status & 1)
            field_pos = atoi(value);
        status = cli$get_value(descr("field.size"), &value_dsc, &leng);
        value[leng] = '\0';
        field_size = atoi(value);
    }
    strcpy(punctuation, PUNCT_CHARS);  /* default for /punctuation */
    if (cli$present(descr("punctuation")) & 1) {
        status = cli$get_value(descr("punctuation"), &punct_dsc, &leng);
        temp_punct[leng] = '\0';
        if (temp_punct[0] == '"') {  /* if quoted string */
            /* take the text between the quotes and terminate it, so that
               nothing of the default set is left behind it */
            ind = (leng >= 2) ? leng - 2 : 0;
            if (ind > 0) {
                strncpy(punctuation, temp_punct+1, ind);
                punctuation[ind] = '\0';
            }
        }
        else if (strlen(temp_punct) > 0)
            strcpy(punctuation, temp_punct);
    }
    else {  /*  /nopunctuation="$" means exclude $ from punct chars */
        status = cli$get_value(descr("punctuation"), &punct_dsc, &leng);
        temp_punct[leng] = '\0';
        if (temp_punct[0] == '"') {   /* if quoted string */
            /* drop both quotes; memmove because the regions overlap */
            memmove(temp_punct, temp_punct+1, strlen(temp_punct+1) + 1);
            temp_punct[(leng >= 2) ? leng-2 : 0] = '\0';
        }
        for (cp = temp_punct; *cp; cp++) {
            cp2 = strchr(punctuation, *cp);
            if (cp2)  /* remove character from punctuation (regions overlap) */
                memmove(cp2, cp2+1, strlen(cp2+1) + 1);
        }
    }
    if ((status = cli$present(descr("max_topics"))) & 1) {
        status = cli$get_value(descr("max_topics"), &value_dsc, &leng);
        value[leng] = '\0';
        max_topic = max_count = atoi(value);  /* query assumes topic = count */
        if (max_topic > 9) {
            printf("/MAX_TOPICS specifies the number of digits in the topic number field.\n");
            printf("A 32 bit system cannot handle integers greater than 9 digits.\n");
            exit(9);
        }
    }
    if ((status = cli$present(descr("minimum_word"))) & 1) {
        status = cli$get_value(descr("minimum_word"), &value_dsc, &leng);
        value[leng] = '\0';
        minimum_word = atoi(value);
    }
    count_words = (cli$present(descr("count_words")) & 1);

    for (index = 0; index < TOPIC_SIZE; index++) {
        topics[index].text = NULL;
        topics[index].found = NULL;
        topics[index].pos = 0;
        topics[index].size = 0;
        topics[index].exclude = 0;
    }
    parse_topic(cli_input, topics); /* parse the command line and fill topics */

    status = cli$get_value(descr("file"), &file_dsc, &leng);  /* get source */
    file_dsc.dsc$w_length = leng;  /* set the descriptor length */

    strncpy(file_spec, "", sizeof(file_spec));  /* clear out file_spec */
    status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
    if ((status & 1) == 0) {
        perror("lib$find_file failed");
        exit(11);
    }
    ptr = strchr(file_spec, ' ');
    if (ptr)
        *ptr = '\0';            /* chop off trailing spaces */

    strcpy(out_name, file_spec);    /* make copy for output spec */

    if (cli$present(descr("output")) & 1) { /* if /output, overwrite out_name */
        status = cli$get_value(descr("output"), &out_dsc, &leng);
        out_name[leng] = '\0';
    }

    words_size = words_index = 0;    /* no words yet */
    words = NULL;
    noise_size = noise_index = 0;    /* no noise yet */
    noise = NULL;
    if ((cli$present(descr("link")) & 1) == 0)
        load_noise(punctuation);  /* build a list of words to ignore */
    db_index = 0;

    /* divider line to compare against: dash_len dashes or equal signs */
    dashes = (char *)malloc(dash_len+1);
    memset((void *)dashes, (type==dash)?'-':'=', dash_len);
    dashes[dash_len] = '\0';

    memset((void *) spaces, ' ', DESC_SIZE); /* make spaces for padding topic */
    spaces[DESC_SIZE] = '\0';

    keyword = (char *) calloc(max_word + max_count + 1, sizeof(char));
    idx_record = (char *) calloc(max_word + max_count + max_topic + 1,
                                 sizeof(char));

    ptr = strrchr(out_name, '.');  /* just get file name */
    if (ptr) *ptr = '\0';
    strcat(out_name, ".idx");

    /* .idx: fixed-length records of word + topic number + count,
       keyed on the word + topic number prefix */
    idxfab = cc$rms_fab;
    idxfab.fab$l_alq = 50;
    idxfab.fab$b_bks = 3;
    idxfab.fab$w_deq = 10;
    idxfab.fab$b_fac = FAB$M_GET | FAB$M_PUT | FAB$M_UPD;
    idxfab.fab$l_fna = out_name;
    idxfab.fab$b_fns = strlen(out_name);
    idxfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW;
    idxfab.fab$w_mrs = max_word + max_topic + max_count;
    idxfab.fab$b_org = FAB$C_IDX;
    idxfab.fab$b_rat = FAB$M_CR;
    idxfab.fab$b_rfm = FAB$C_FIX;
    idxfab.fab$b_shr = FAB$M_NIL;
    idxfab.fab$l_xab = (char *) &idxxab;

    idxrab = cc$rms_rab;
    idxrab.rab$l_fab = (struct FAB *) &idxfab;
    idxrab.rab$b_krf = 0;
    idxrab.rab$l_kbf = keyword;
    idxrab.rab$b_ksz = max_word + max_topic;
    idxrab.rab$b_rac = RAB$C_KEY;
    idxrab.rab$l_rbf = idx_record;
    idxrab.rab$w_rsz = max_word + max_topic + max_count;
    idxrab.rab$l_ubf = idx_record;
    idxrab.rab$w_usz = max_word + max_topic + max_count;
    idxrab.rab$b_mbf = 20;
    idxrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH;

    idxxab = cc$rms_xabkey;
    idxxab.xab$b_dtp = XAB$C_STG;
    idxxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR;
    idxxab.xab$w_pos0 = 0;
    idxxab.xab$b_siz0 = max_word + max_topic;
    idxxab.xab$b_ref = 0;

    if ((cli$present(descr("link")) & 1) == 0) {
        if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL)
            lib$stop(status);
        if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL)
            lib$stop(status);
    }

    ptr = strrchr(out_name, '.');  /* just get file name */
    if (ptr) *ptr = '\0';
    strcat(out_name, ".sel");

    /* .sel: variable-length selector records keyed on the topic number */
    selfab = cc$rms_fab;
    selfab.fab$l_alq = 10;
    selfab.fab$b_bks = 3;
    selfab.fab$w_deq = 5;
    selfab.fab$b_fac = FAB$M_PUT;
    selfab.fab$l_fna = out_name;
    selfab.fab$b_fns = strlen(out_name);
    selfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW;
    selfab.fab$w_mrs = max_topic + DESC_SIZE + SELECTOR_SIZE;
    selfab.fab$b_org = FAB$C_IDX;
    selfab.fab$b_rat = FAB$M_CR;
    selfab.fab$b_rfm = FAB$C_VAR;
    selfab.fab$b_shr = FAB$M_NIL;
    selfab.fab$l_xab = (char *) &selxab;

    selrab = cc$rms_rab;
    selrab.rab$l_fab = (struct FAB *) &selfab;
    selrab.rab$b_rac = RAB$C_KEY;
    selrab.rab$l_rbf = sel_record;
    selrab.rab$b_mbf = 20;
    selrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH;

    selxab = cc$rms_xabkey;
    selxab.xab$b_dtp = XAB$C_STG;
    selxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR;
    selxab.xab$w_pos0 = 0;
    selxab.xab$b_siz0 = max_topic;
    selxab.xab$b_ref = 0;

    if ((cli$present(descr("link")) & 1) == 0) {
        if (((status = sys$create(&selfab)) & 1) != SS$_NORMAL)
            lib$stop(status);
        if (((status = sys$connect(&selrab)) & 1) != SS$_NORMAL)
            lib$stop(status);
    }

    if (cli$present(descr("link")) & 1) {
        ptr = strrchr(out_name, '.');
        if (ptr) *ptr = '\0';
        strcat(out_name, ".link");
        lnk = fopen(out_name, "w", "mbc=50", "mbf=20");
        if (lnk == NULL) {
            printf("Can't create link file %s\n", out_name);
            exit(13);
        }
        fprintf(lnk, "Sortdir=False\n\n");
    }

    for (;;) {  /* process all files in input spec, first one already found */

        if ((src = fopen(file_spec, "r", "mbc=50", "mbf=20")) == NULL) { 
            printf("Can't read input file %s\n", file_spec);
            exit(3);
        }
        printf("Building index for %s\n", file_spec);

        start_pos = ftell(src);          /* init start position */
        strncpy(desc, "", DESC_SIZE + 1);
        
        while (fgets(src_line, sizeof(src_line), src)) {
            if ((src_line[0] == hex_value) && (type == hex)) {
                write_words(src, lnk, &selrab, &idxrab, &db_index, &start_pos,
                            &end_pos, desc, topics);
                continue;
            }
            ptr = strchr(src_line, '\n');
            if (ptr) *ptr = '\0';              /* remove newline */
            for (ptr = src_line; *ptr; ptr++)
                if (iscntrl(*ptr))  *ptr = ' ';  /* convert tabs to spaces */
            while ((strlen(src_line) > 0) &&
                   (src_line[strlen(src_line)-1] == ' '))
                src_line[strlen(src_line)-1] = '\0';/* remove trailing blanks */
            strcpy(orig_line, src_line);    /* copy before forcing lower case */
            for (ptr = src_line; *ptr; ptr++)
                *ptr = _tolower(*ptr);           /* force lowercase */
            for (ptr = src_line; *ptr; ptr++)
                if (*ptr > ' ') break;  /* find first non-blank char */
            /* remove leading blanks (regions overlap, so memmove) */
            memmove(src_line, ptr, strlen(ptr) + 1);
        
            if (((type == equal) || (type == dash)) &&
                (strncmp(orig_line, dashes, dash_len) == 0)) {
                write_words(src, lnk, &selrab, &idxrab, &db_index, &start_pos,
                            &end_pos, desc, topics);
                continue;
            }
            if ((type == para) && (strlen(src_line) == 0)) {
                write_words(src, lnk, &selrab, &idxrab, &db_index, &start_pos,
                            &end_pos, desc, topics);
                continue;
            }
            if ((type == field) && !is_spaces(orig_line, field_pos, field_size)) {
                write_words(src, lnk, &selrab, &idxrab, &db_index, &start_pos,
                            &end_pos, desc, topics);
                start_pos = end_pos;   /* don't skip over line with field break */
            }
            /* save the first line by default */ 
            if ((cli$present(descr("default_topic")) & 1) && (strlen(desc) == 0))
                strncpy(desc, orig_line, DESC_SIZE);

            /* apply topic rules; the scan stops at the first unused slot
               (pos == 0) or at the end of the table, whichever comes first */
            for (index = 0; (index < TOPIC_SIZE) && topics[index].pos; index++)
                if ((topics[index].text &&
                     /* the keyword can only match inside the line */
                     ((int) strlen(src_line) >= topics[index].pos - 1) &&
                     strncmp(src_line + topics[index].pos - 1,
                             topics[index].text, strlen(topics[index].text)) == 0) ||
                    (!topics[index].text &&
                     !is_spaces(orig_line, topics[index].pos, topics[index].size))) {
                    if (topics[index].exclude && topics[index].text) {
                        /* cut the keyword out of the line; a positional
                           rule has no keyword, so there is nothing to cut */
                        cp = orig_line + topics[index].pos - 1;
                        cp2 = cp + strlen(topics[index].text);
                        memmove(cp, cp2, strlen(cp2) + 1);
                    }
                    topics[index].found = (char *) my_realloc((char *) topics[index].found,
                                         (topics[index].size ? topics[index].size : strlen(orig_line))
                                         + 1);
                    if (topics[index].size > 0) {
                        /* fixed width: truncate, then pad with blanks */
                        strncpy(topics[index].found, orig_line + topics[index].pos - 1,
                                topics[index].size);
                        topics[index].found[topics[index].size] = '\0';
                        strncat(topics[index].found, spaces,
                                topics[index].size - strlen(topics[index].found));
                    }
                    else
                        strcpy(topics[index].found, orig_line + topics[index].pos - 1);
                    break;     /* a line satisfies only one topic rule */
                }
            if ((cli$present(descr("link")) & 1) == 0)
                build_words(src_line, punctuation, minimum_word);
            end_pos = ftell(src);  /* end_pos points before any terminator */
            if (type == line)
                write_words(src, lnk, &selrab, &idxrab, &db_index, &start_pos,
                            &end_pos, desc, topics);
        }
        
        /* in case file doesn't end with a terminator */
        write_words(src, lnk, &selrab, &idxrab, &db_index, &start_pos, &end_pos,
                    desc, topics);
        fclose(src);
        status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
        if ((status & 1) == 0) {
            lib$find_file_end(&context);
            break;
        }
        ptr = strchr(file_spec, ' ');
        if (ptr) *ptr = '\0';            /* chop off trailing spaces */
    }
    /* the one .link file collects the entries of every input file, so it
       is closed only after the last file has been processed */
    if (lnk)
        fclose(lnk);
    sys$close(&selfab);
    sys$close(&idxfab);
}


/*
 * Break a line into words and save the indexable ones in words[].
 *
 *   line          NUL-terminated text; modified in place (punctuation is
 *                 turned into blanks and blanks into terminators)
 *   punct         characters treated as word separators
 *   minimum_word  passed to is_noise(); shorter words are not saved
 *
 * Words longer than max_word are reported and stored truncated to
 * max_word characters.  Nothing is appended to line, so a completely full
 * line buffer is safe.
 */

void build_words(char *line, char *punct, int minimum_word)
{
    char *cp, *cp2;
    int  len, last = 0;

    for (cp = line; *cp; cp++)      /* convert punctuation to spaces */
        if (is_punct(*cp, punct)) *cp = ' ';

    cp = line;
    while (!last) {                 /* break at space boundary */
        cp2 = strchr(cp, ' ');
        if (cp2)
            *cp2 = '\0';
        else
            last = 1;               /* final word runs to the end of line */
        len = strlen(cp);
        if (len > max_word)
            printf("Truncating %d character word (%s) to %d characters\n",
                   len, cp, max_word);
        if ((len > 0) && !is_noise(cp, noise_size, minimum_word)) {
            if (words_index == words_size)  /* table full */
                expand_table(&words, &words_size);
            /* each slot holds max_word + 1 bytes (see expand_table) */
            strncpy(words[words_index], cp, max_word);
            words[words_index][max_word] = '\0';
            words_index++;
        }
        if (!last)
            cp = cp2 + 1;
    }
}


/* Grow *table by CHUNK slots, each a zero-filled buffer of max_word + 1
   characters, and add CHUNK to *size */

void expand_table(char ***table, int *size)
{
    int  first_new = *size;
    int  new_total = first_new + CHUNK;
    int  slot;
    char **grown;

    grown = (char **) my_realloc((char **) *table, new_total * sizeof(char *));
    *table = grown;
    slot = first_new;
    while (slot < new_total) {
        grown[slot] = (char *) calloc(max_word + 1, sizeof(char));
        slot++;
    }
    *size = new_total;
}


/*
 * Return base raised to the power exp, using integer arithmetic.
 * An exp of zero (or less) yields 1, so the loop always terminates.
 * No overflow checking is done; the caller keeps exp small.
 */
int power(int base, int exp)
{
    int result = 1;

    while (exp-- > 0)
        result *= base;
    return result;
}


/*
 * Finish the current topic: write its selector and its words, then reset
 * the per-topic state.
 *
 *   src        input file; supplies the file name (fgetname) and the
 *              position where the next topic starts (ftell)
 *   lnk        .link output stream, used only when /LINK is present
 *   selptr     RAB of the .sel file
 *   idxptr     RAB of the .idx file
 *   db_index   running topic number, incremented here when not /LINK
 *   start_pos  start of the topic; reset to the current file position
 *   end_pos    end of the topic
 *   desc       topic description, a buffer of DESC_SIZE + 1 bytes; replaced
 *              by the captured /TOPIC text when there is any, cleared on exit
 *   topics     the TOPIC_SIZE topic rules; their found text is emptied
 *
 * Without /LINK nothing happens when no words were collected.  A topic
 * with an empty description is not written at all.
 */

void write_words(FILE *src, FILE *lnk, struct RAB *selptr, struct RAB *idxptr,
                 int *db_index, int *start_pos, int *end_pos, char *desc,
                 topic_str *topics)
{
    int ind, status;
    char filename[256], count[20], *ptr, temp_desc[512] = "";

    if ((cli$present(descr("link")) & 1) == 0) {
        if (words_index == 0)
            return;      /* no words to write */
        (*db_index)++;
        /* the topic number must fit in max_topic digits */
        if ((*db_index) == power(10, max_topic)) {
            printf("You have reached %d topics in this index\n", *db_index);
            printf("Please re-index with /MAX_TOPIC larger than %d\n", max_topic);
            exit(5);
        }
    }
    fgetname(src, filename);
    if ((cli$present(descr("version")) & 1) == 0) {  /* if /noversion */
        ptr = strchr(filename, ';');  /* get rid of version number */
        if (ptr) *ptr = '\0';
    }
    for(ptr = filename; *ptr; ptr++)
        *ptr = _tolower(*ptr);  /* force filename lowercase */

    /* join the captured topic texts with single blanks.  Only the first
       DESC_SIZE characters are used below, so appending is simply bounded
       by the size of temp_desc. */
    for (ind = 0; ind < TOPIC_SIZE; ind++)
        if (topics[ind].found && (strlen(topics[ind].found) > 0)) {
            if (strlen(temp_desc) > 0)
                strncat(temp_desc, " ",
                        sizeof(temp_desc) - strlen(temp_desc) - 1);
            strncat(temp_desc, topics[ind].found,
                    sizeof(temp_desc) - strlen(temp_desc) - 1);
        }
    if (strlen(temp_desc) > 0) {
        strncpy(desc, temp_desc, DESC_SIZE);
        desc[DESC_SIZE] = '\0';
    }

    if (strlen(desc) > 0) {  /* no description, no index */
        if ((cli$present(descr("link")) & 1) == 0) {
            /* write out the selector: zero-padded topic number, the
               character '0', the description, a tab, then
               R<start>-<end>-<file name> */
            sprintf(sel_record, "%0*d0%s\tR%d-%d-%s",
                    max_topic, *db_index, desc, *start_pos, *end_pos, filename);
            selptr->rab$w_rsz = strlen(sel_record);
            if (((status = sys$put(selptr)) & 1) != SS$_NORMAL)
                lib$stop(status);
        }
        else {
            fprintf(lnk, "Name=%s\nType=0\n", desc);
            fprintf(lnk, "Path=R%d-%d-%s\n", *start_pos, *end_pos, filename);
            fprintf(lnk, "Port=+\nHost=+\n\n");
        }

        if ((cli$present(descr("link")) & 1) == 0)
            /* write out the words */
            for (ind = 0; ind < words_index; ind++) {
                /* key = padded word + topic number; record = key + count */
                sprintf(keyword, "%-*s%0*d", max_word, words[ind],
                        max_topic, *db_index);
                sprintf(idx_record, "%-*s%0*d%0*d",
                        max_word, words[ind],
                        max_topic, *db_index,
                        max_count, 1);
                status = sys$put(idxptr);
                /* record exists, increment count */
                if ((status == RMS$_DUP) && count_words) {
                    status = sys$get(idxptr);
                    if ((status & 1) != SS$_NORMAL)
                        lib$stop(status);
                    strncpy(count, idx_record + max_word + max_topic, max_count);
                    count[max_count] = '\0';
                    sprintf(idx_record, "%-*s%0*d%0*d",
                            max_word, words[ind],
                            max_topic, *db_index,
                            max_count, atoi(count) + 1);
                    status = sys$update(idxptr);
                    if ((status & 1) != SS$_NORMAL)
                        lib$stop(status);
                }
                /* a duplicate without /COUNT_WORDS is silently skipped */
                if ((status != RMS$_DUP) && ((status & 1) != SS$_NORMAL))
                    lib$stop(status);
                *words[ind] = '\0';
            }
        printf("%s\n", desc);
    }
    /* desc is a pointer here, so the buffer size is spelled out */
    strncpy(desc, "", DESC_SIZE + 1);
    *start_pos = ftell(src);          /* init start position */
    words_index = 0;
    for (ind = 0; ind < TOPIC_SIZE; ind++)
        if (topics[ind].found)
            *topics[ind].found = '\0';
}


/* read in a file of noise words, one per line */

void load_noise(char *punct)
{
    FILE *nf;
    char *cp, *cp2, line[256];
    static char file_name[256];
    short leng;
    int status;
    $DESCRIPTOR(noise_dsc, file_name);

    if (cli$present(descr("noise")) & 1) { 
        status = cli$get_value(descr("noise"), &noise_dsc, &leng);
        file_name[leng] = '\0';
        if ((nf = fopen(file_name, "r")) == NULL) {
            printf("Can't read noise file %s\n", file_name);
            return;
        }
    }
    else if ((nf = fopen("_noise_words", "r", "dna = gopher_root:[000000].dat")) == NULL)
        return;

    while (fgets(line, sizeof(line), nf)) {
        cp = strchr(line, '\n');
        if (cp) *cp = '\0';               /* remove newline */
        for (cp = line; *cp; cp++) {
            if (is_punct(*cp, punct) || iscntrl(*cp))
                *cp = ' ';            /* convert punctuation, tabs to spaces */
            *cp = _tolower(*cp);          /* force lowercase */
        }
        while ((strlen(line) > 0) &&
               (line[strlen(line)-1] == ' '))
            line[strlen(line)-1] = '\0';  /* remove trailing blanks */
        for (cp = line; *cp; cp++)
            if (*cp > ' ') break;         /* find first non-blank char */
        strcpy(line, cp);                 /* remove leading blanks */

        strcat(line, " ");                /* line ends with a space */
        cp = line;
        while(cp2 = strchr(cp, ' ')) {    /* break at space boundary */
            *cp2 = '\0';
            if (strlen(cp) > 0) {
                if (noise_index == noise_size)  /* table full */
                    expand_table(&noise, &noise_size);
                strcpy(noise[noise_index++], cp);
            }
            cp = cp2 + 1;
        }
    }

    fclose(nf);
}

/* Return TRUE when ch is one of the characters of the NUL-terminated
   string punct, FALSE otherwise (a NUL ch never matches) */

int is_punct(char ch, char *punct)
{
    char *scan = punct;

    while (*scan != '\0') {
        if (*scan == ch)
            return TRUE;
        scan++;
    }
    return FALSE;
}


/*
 * See if a field of a line is blank.
 *
 *   line  NUL-terminated text
 *   pos   1-based column where the field starts
 *   size  width of the field
 *
 * Returns nonzero (1) when the field holds nothing but white space and
 * zero when it holds anything else.  Columns beyond the end of the line
 * count as blank, so a field that starts at or runs past the end of the
 * line is never read beyond the terminator; a pos below 1 is treated as a
 * blank field as well.
 */

int is_spaces(char *line, int pos, int size)
{
    int length = (int) strlen(line);
    int index;

    if ((pos < 1) || (length < pos))
        return(1);
    for (index = 0; (index < size) && (pos + index - 1 < length); index++)
        if (!isspace((unsigned char) line[pos + index - 1]))
            return(0);
    return(1);
}


/* Decide whether a word should be left out of the index: TRUE when it is
   shorter than minimum_word or equal to one of the first size entries of
   noise[], FALSE otherwise */

int is_noise(char *word, int size, int minimum_word)
{
    int slot = 0;

    /* the length test comes first; it saves lots of noise entries */
    if (minimum_word > strlen(word))
        return(TRUE);
    while (slot < size) {
        if (strcmp(noise[slot], word) == 0)
            return (TRUE);
        slot++;
    }
    return (FALSE);
}


/* descr() creates character descriptor and return the address
of the descriptor to the caller. */
# define N_DESCR 10
static struct dsc$descriptor_s str_desc[N_DESCR];
static int cur_descr = -1;

struct dsc$descriptor_s *descr(char *string)
{
    if(++cur_descr >= N_DESCR) cur_descr = 0;
    str_desc[cur_descr].dsc$w_length=(short)strlen(string);      
    str_desc[cur_descr].dsc$b_dtype=DSC$K_DTYPE_T;   
    str_desc[cur_descr].dsc$b_class=DSC$K_CLASS_S;  
    str_desc[cur_descr].dsc$a_pointer=string;     
    return (&str_desc[cur_descr]);
}


/*
 * parse command line for /topic
 *
 *   line    the command line; forced to lowercase in place
 *   topics  table of TOPIC_SIZE rules, filled in the order the /topic
 *           qualifiers appear
 *
 * Every switch whose name starts with 't' is taken to be /topic.  Its
 * value is a single element or a parenthesized list of elements, each
 * recognized by its first letter: t(ext)=string, p(osition)=n, s(ize)=n,
 * e(xclude).  Position defaults to 1 and is kept within 1..256; size is
 * kept within 0..DESC_SIZE.  Parsing stops at the end of the line or when
 * the table is full.
 */

/* Step from a keyword to its value: skip to the '=' or ':' separator,
   then over it and any blanks.  Stops at the end of the string. */
static char *topic_value(char *ptr)
{
    while (*ptr && (*ptr != '=') && (*ptr != ':'))
        ptr++;
    if (*ptr)
        ptr++;                         /* step over the separator */
    while (*ptr && isspace((unsigned char) *ptr))
        ptr++;
    return ptr;
}

void parse_topic(char *line, topic_str *topics)
{
    char *ptr, *start;
    int  number;
    static int index = -1;             /* last topics[] slot filled so far */

    for (ptr = line; *ptr; ptr++)
        *ptr = _tolower(*ptr);         /* force command line lowercase */
    ptr = line;                        /* point to start of line */
    for (;;) {                         /* search for /topic until end of line */
        if (index + 1 >= TOPIC_SIZE)
            return;                    /* exit if we can't hold any more */
        ptr = strchr(ptr, '/');        /* search for switch start */
        if (ptr == NULL)
            return;                    /* no more switches */
        while (isspace(*++ptr));       /* skip spaces */
        if (*ptr != 't')               /* topic is unique to one character */
            continue;                  /* not /topic, keep scanning */
        ptr = topic_value(ptr);        /* skip to the qualifier value */
        if (*ptr == '(')               /* if start of list */
            while (isspace(*++ptr));   /* skip spaces */
        if (*ptr == '\0')
            return;                    /* /topic without a value: no rule */
        index++;                       /* next topics structure */
        topics[index].pos = 1;         /* default the position to 1 */
        for (;;) {                     /* parse all /topic list elements */
            switch (*ptr) {
                case 't':                            /* text */
                    ptr = topic_value(ptr);
                    if (*ptr == '"') {           /* if quoted string */
                        start = ++ptr;           /* skip over quote */
                        while (*ptr && (*ptr != '"'))
                            ptr++;               /* skip to ending quote */
                    }
                    else {                       /* else non-quoted string */
                        start = ptr;             /* start of string */
                        while (*ptr
                               && (*ptr != ' ')
                               && (*ptr != ',')
                               && (*ptr != '/')
                               && (*ptr != ')'))
                            ptr++;               /* skip to string terminator */
                    }
                    topics[index].text = (char *) calloc((ptr - start) + 1, sizeof(char));
                    if (topics[index].text)      /* stays positional if out of memory */
                        strncpy(topics[index].text, start, ptr - start);
                    break;
                case 'p':                        /* position */
                    ptr = topic_value(ptr);
                    number = atoi(ptr);          /* get (hopefully) decimal number */
                    if (number < 1)
                        number = 1;              /* columns are numbered from 1 */
                    if (number > 256)
                        number = 256;
                    topics[index].pos = number;
                    break;
                case 's':                        /* size */
                    ptr = topic_value(ptr);
                    number = atoi(ptr);          /* get (hopefully) decimal number */
                    if (number < 0)
                        number = 0;
                    if (number > DESC_SIZE)
                        number = DESC_SIZE;
                    topics[index].size = number;
                    break;
                case 'e':                         /* exclude */
                    topics[index].exclude = TRUE; /* has no parameters */
                    break;
                default:                          /* unknown element, skipped below */
                    break;
            }
            while (*ptr
                   && (*ptr != ' ')              /* skip to end of */
                   && (*ptr != ',')              /* keyword */
                   && (*ptr != '/')              /* switch */
                   && (*ptr != ')'))             /* or parameter */
                ptr++;
            while (*ptr &&
                   ((*ptr <= ' ') ||             /* skip spaces, junk */
                    (*ptr == ',')))              /* list separators */
                ptr++;
            if (*ptr == '\0')
                return;                          /* end of the line */
            if ((*ptr == ')') || (*ptr == '/'))
                break;                           /* end of the list */
        }                                        /* scan for more list elements */
    }
}


/* Resize a heap block to size bytes; a null mem asks for a fresh block
   from malloc() instead of being handed to realloc() */
void *my_realloc(void *mem, int size)
{
    void *block;

    if (mem != (void *) 0)
        block = (void *) realloc(mem, size);
    else
        block = (void *) malloc(size);
    return (block);
}

