/* index sequential files, producing .idx and .sel files */ /* Bruce Tanner - Cerritos College */ /* Version History: 1.0 05/10/93 Original attempt 2.0 06/20/93 Create indexed files directly, add keyword count field 2.1 07/08/93 Change the file name for NOISE_WORDS 2.2 07/08/93 Move the range end (end_pos) to before the terminator 2.2jlw 07/14/93 - JLW added length spec to dash, added additional topic divider keywords 2.3 07/19/93 Set multi-buffer, multi-block counts, read-ahead, write-behind and deferred write; noticeably increased performance 2.4 07/26/93 Removed index name, added CLI$ interface, added /TOPIC 2.4jlw 07/27/93 fixed version retention, which was broken 2.5 07/27/93 Selector strings forced to lowercase; use a good copy 2.6 07/29/93 revamp /TOPIC syntax to include text, size, exclude 2.7 07/30/93 make SIZE=n pad as well as truncate field width 2.8 08/03/93 take wildcard input file names, add /OUTPUT, /VERSION 2.9 08/05/93 JLW changed filename sizes from 80 to 256 characters 2.10 08/05/93 add check for max number of topics, reformat code 2.11 08/24/93 JLW added specific statuses for exit errors 2.12 10/01/93 add /NODEFAULT_TOPIC to omit topics that have no topic keyword 2.13 11/03/93 add /LINK to generate .link file instead of .idx/.sel 2.14 11/15/93 add /NOISE=file to specify the noise words file 2.15 11/17/93 add /TOPIC=(position), /FIELD=(position, size), /PUNCTUATION 2.16 11/18/93 fix illegal strcpy for AXP, add /MAX_TOPICS 2.17 11/21/93 make load_noise friendlier, add /NOPUNCTUATION support 2.18 11/27/93 add /MINIMUM_WORD, /COUNT_WORDS 2.19 11/30/93 fix broken /TOPIC 2.20 03/20/94 sort words, add /LINK=SORT, /SEQUENTIAL, remove /COUNT_WORDS 2.21 04/29/94 add /NONUMBERS 2.22 06/23/94 add /TOPIC=(offset) /TOPIC=(position=0) 2.23 06/24/94 add /TOPIC=(end) 2.24 06/27/94 add /CANDIDATE, /KEYWORD=(text,end,exclude) 2.24a 06/29/94 replaced VAXC-specific "#include foo" declarations with more portable "#include " (so DECC won't balk). 2.25 08/04/94 fix /TOPIC=END not matching 2.26 09/15/94 /KEYWORD=END=foo stopped at end of line if 'foo' wasn't found 2.27 09/27/94 change get_text() to return updated pointer to fix mangled text 3.0 09/29/94 redo parsing routines, add /SPECIFICATION, /TOPIC=BREAK 3.1 10/10/94 add /SELECTOR, don't index selector line 3.2 10/17/94 change /SELECTOR to /SELECTOR=(TEXT,END,BOTH) 3.3 11/04/94 add /KEYWORD=(offset), extend selector to include host/port 3.4 11/07/94 add /HELPFILE /SELECTOR=IGNORE 3.5 12/16/94 move close of link file for wildcards 3.6 01/02/95 program around selector.end default problem */ #include #include #include #include #include #include #include #include #include #include #define CHUNK 100 /* increment to expand table of words */ #define DESC_SIZE 500 /* maximum size of a topic description */ #define SELECTOR_SIZE 200 /* maximum size of a selector (minus description) */ #define TOPIC_SIZE 20 /* maximum number of topics to list */ #define PUNCT_CHARS "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" #define DEFAULT_POS 0 /* column to look for topic by default; 0 = anywhere */ #define MAX_INPUT_LINE 500 typedef struct { char *text; char *found; char *end; int pos; int size; int offset; int exclude; int force_break; int used; } topic_str; typedef struct { char *name; int state; char *value; } switch_str; typedef struct dsc$descriptor_s dsc; int words_index, words_size; char **words; int noise_index, noise_size; int candidate_index, candidate_size; char **noise; char **candidate; int max_word, max_topic, max_count, sequential; char *idx_record, *prev_keyword; char sel_record[DESC_SIZE + SELECTOR_SIZE + 100]; int keyword_index; FILE *spc; switch_str switches[50]; topic_str topics[TOPIC_SIZE]; topic_str keywords[TOPIC_SIZE]; char *selector_spec; int cli$dcl_parse(); int cli$get_value(); int cli$present(); void build_words(char *, char *, int); void test_words(char *, char *, int, topic_str *); void expand_table(char ***, int *); void write_words(FILE *, FILE *, struct RAB *, struct RAB*, int *, int *, char *, topic_str *); void load_words(char *, char *, char ***, int *, int *); int is_noise(char *, int, int); int is_candidate(char *, int); int is_punct(char, char *); int is_spaces(char *, int, int); dsc *descr(char *string); void parse_topic(char *, topic_str *); void parse_keyword(char *, topic_str *); void *my_realloc(void *, int); void index_commands(); int find_str(char *, char *); void parse_commands(dsc *, switch_str[]); int switch_present(char *); char *switch_value(char *); main(int argc, char *argv[]) { FILE *src, *lnk; char *cp, *cp2, *ptr, desc[DESC_SIZE + 1], src_line[MAX_INPUT_LINE]; static char cli_input[256], punctuation[128], temp_punct[128]; static char value[20], file_arg[256], file_spec[256], out_name[256]; static char spec_name[256], spec_line[270]; char orig_line[MAX_INPUT_LINE], lc_line[MAX_INPUT_LINE]; char spaces[DESC_SIZE + 1], help_index[10]; int start_pos, end_pos, status, index, context = 0; enum {none, para, dash, hex, equal, line, whole, field} type = none; int dash_len = 0, ind, minimum_word, where; int hex_value, field_pos = 1, field_size; short leng; char *dashes = NULL; struct FAB idxfab, selfab; struct RAB idxrab, selrab; struct XABKEY idxxab, selxab; $DESCRIPTOR(input_dsc, cli_input); $DESCRIPTOR(file_dsc, file_arg); $DESCRIPTOR(file_spec_dsc, file_spec); $DESCRIPTOR(out_dsc, out_name); $DESCRIPTOR(punct_dsc, temp_punct); $DESCRIPTOR(value_dsc, value); $DESCRIPTOR(spec_dsc, spec_name); $DESCRIPTOR(spec_line_dsc, spec_line); for (index = 0; index < TOPIC_SIZE; index++) { topics[index].text = NULL; topics[index].end = NULL; topics[index].found = NULL; topics[index].pos = 0; topics[index].size = 0; topics[index].offset = 0; topics[index].exclude = 0; topics[index].used = 0; topics[index].force_break = 0; keywords[index].text = NULL; keywords[index].end = NULL; keywords[index].found = NULL; keywords[index].pos = 0; keywords[index].size = 0; keywords[index].offset = 0; keywords[index].exclude = 0; keywords[index].used = 0; keywords[index].force_break = 0; } status = lib$get_foreign(&input_dsc, 0, &leng, 0); for (ind = leng; ind >= 0; ind--) cli_input[ind+6] = cli_input[ind]; strncpy(cli_input, "index ", 6); input_dsc.dsc$w_length = leng+6; status = cli$dcl_parse(&input_dsc, index_commands); if (status != CLI$_NORMAL) /* error in parse, exit */ exit(7); if ((cli$present(descr("file")) & 1) == 0) { printf("Usage: index document\n"); printf(" /CANDIDATES=file specify a file of words for index candidates\n"); printf(" /CHARACTER=n text separated by control character 'n'\n"); printf(" /DASH=n text separated n dashes (default 3)\n"); printf(" /[NO]DEFAULT_TOPIC keep [discard] topics [not] matched by /TOPIC\n"); printf(" /EQUAL=n text separated n equals (default 80)\n"); printf(" /FF text separated by form feeds\n"); printf(" /FIELD=(position,size) specify topic break on field\n"); printf(" /HELPFILE=(selector,title) file to match query \"?\"\n"); printf(" /KEYWORD=(text,end,offset,exclude) specify indexing range\n"); printf(" /LINE each line is separate text entry\n"); printf(" /LINK[=SORT] generate .link file instead of .idx,.sel files\n"); printf(" /MAX_TOPICS=n maximum size of topic ID field (default 4)\n"); printf(" /MINIMUM_WORD=n define minimum word to index (default 3)\n"); printf(" /NOISE=file specify a file of words to omit in the index\n"); printf(" /NONUMBERS omit all numbers from the index\n"); printf(" /OUTPUT=file override name of index/selection files\n"); printf(" /PARAGRAPH text separated by blank lines\n"); printf(" /PUNCTUATION=\"...\" specify the characters that separate words\n"); printf(" /SELECTOR=(text,end,both,ignore) specify selectors to generate\n"); printf(" /SEQUENTIAL create sequential files (.seqidx, .seqsel)\n"); printf(" /SPECIFICATION=file specify a file of switches\n"); printf(" /TOPIC=(text,end,position,size,offset,exclude,break) specify topic names\n"); printf(" /[NO]VERSION keep [discard] document version in selection\n"); printf(" /WHOLE whole file is one text entry\n"); printf(" /WORD_LENGTH=n maximum size of index key (default 20)\n"); exit(1); } if (cli$present(descr("specification")) & 1) { status = cli$get_value(descr("specification"), &spec_dsc, &leng); spec_name[leng] = '\0'; if ((spc = fopen(spec_name, "r")) == NULL) { printf("Can't read spec file %s\n", spec_name); exit(13); } /* parse every line of the spec file */ while (fgets(spec_line, sizeof(spec_line), spc)) { if ((spec_line[0] == '\n') || (spec_line[0] == '#') || (spec_line[0] == '!')) continue; /* skip blank and comment lines */ ptr = strchr(spec_line, '\n'); if (ptr) *ptr = '\0'; leng = strlen(spec_line); for (ind = leng; ind >= 0; ind--) spec_line[ind+6] = spec_line[ind]; strncpy(spec_line, "index ", 6); spec_line_dsc.dsc$w_length = leng+6; parse_commands(&spec_line_dsc, switches); } } parse_commands(&input_dsc, switches); if (switch_present("paragraph")) type = para; if (switch_present("ff")) { type = hex; /* /FF same as /character=12 */ hex_value = '\f'; } if (switch_present("character")) { hex_value = atoi(switch_value("character")); type = hex; } if (switch_present("whole")) type = whole; if (switch_present("line")) type = line; if (switch_present("dash")) { dash_len = atoi(switch_value("dash")); type = dash; } if (switch_present("equal")) { dash_len = atoi(switch_value("equal")); type = equal; } if (switch_present("word_length")) { max_word = atoi(switch_value("word_length")); } if (switch_present("field")) { type = field; field_pos = atoi(switch_value("field.position")); field_size = atoi(switch_value("field.size")); } strcpy(punctuation, PUNCT_CHARS); /* default for /punctuation */ if (switch_present("punctuation")) { strcpy(temp_punct, switch_value("punctuation")); if (temp_punct[0] == '"') { /* if quoted string */ strncpy(punctuation, temp_punct+1, leng-2); punctuation[leng-2] = '\0'; } else if (strlen(temp_punct) > 0) strcpy(punctuation, temp_punct); } else { /* /nopunctuation="$" means exclude $ from punct chars */ if (cp = switch_value("punctuation")) strcpy(temp_punct, cp); else strcpy(temp_punct, ""); if (temp_punct[0] == '"') { /* if quoted string */ strcpy(temp_punct, temp_punct+1); temp_punct[leng-2] = '\0'; } for (cp = temp_punct; *cp; cp++) { cp2 = strchr(punctuation, *cp); if (cp2) strcpy(cp2, cp2+1); /* remove character from punctuation */ } } if (switch_present("max_topics")) { max_topic = max_count = atoi(switch_value("max_topics")); /* query assumes topic = count */ if (max_topic > 9) { printf("/MAX_TOPICS specifies the number of digits in the topic number field.\n"); printf("A 32 bit system cannot handle integers greater than 9 digits.\n"); exit(9); } } if (switch_present("minimum_word")) minimum_word = atoi(switch_value("minimum_word")); sequential = (switch_present("sequential")); strcpy(file_arg, switch_value("file")); /* get source */ file_dsc.dsc$w_length = (short) strlen(file_arg); /* set the descriptor length */ strncpy(file_spec, "", sizeof(file_spec)); /* clear out file_spec */ status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); if ((status & 1) == 0) { perror("lib$find_file failed"); exit(11); } ptr = strchr(file_spec, ' '); if (ptr) *ptr = '\0'; /* chop off trailing spaces */ strcpy(out_name, file_spec); /* make copy for output spec */ if (switch_present("output")) /* if /output, overwrite out_name */ strcpy(out_name, switch_value("output")); words_size = words_index = 0; /* no words yet */ words = NULL; noise_size = noise_index = 0; /* no noise yet */ noise = NULL; candidate_size = candidate_index = 0; /* no candidate yet */ candidate = NULL; if (!switch_present("link")) { load_words("noise", punctuation, &noise, &noise_size, &noise_index); load_words("candidates", punctuation, &candidate, &candidate_size, &candidate_index); } dashes = (char *)malloc(dash_len+1); memset((void *)dashes, (type==dash) ? '-' : '=', dash_len); dashes[dash_len] = '\0'; memset((void *) spaces, ' ', DESC_SIZE); /* make spaces for padding topic */ spaces[DESC_SIZE] = '\0'; idx_record = (char *) calloc(max_word + max_count + max_topic + 1, sizeof(char)); prev_keyword = (char *) calloc(max_word + 1, sizeof(char)); ptr = strrchr(out_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; if (sequential) strcat(out_name, ".seqidx"); else strcat(out_name, ".idx"); idxfab = cc$rms_fab; idxfab.fab$l_alq = 100; idxfab.fab$b_bks = 3; idxfab.fab$w_deq = 25; idxfab.fab$b_fac = FAB$M_PUT; idxfab.fab$l_fna = out_name; idxfab.fab$b_fns = strlen(out_name); idxfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; idxfab.fab$w_mrs = max_word + max_topic + max_count; if (sequential) idxfab.fab$b_org = FAB$C_SEQ; else idxfab.fab$b_org = FAB$C_IDX; idxfab.fab$b_rat = FAB$M_CR; idxfab.fab$b_rfm = FAB$C_FIX; idxfab.fab$b_shr = FAB$M_NIL; idxfab.fab$l_xab = (char *) &idxxab; idxrab = cc$rms_rab; idxrab.rab$l_fab = (struct FAB *) &idxfab; idxrab.rab$b_krf = 0; if (sequential) idxrab.rab$b_rac = RAB$C_SEQ; else idxrab.rab$b_rac = RAB$C_KEY; idxrab.rab$l_rbf = idx_record; idxrab.rab$w_rsz = max_word + max_topic + max_count; idxrab.rab$l_ubf = idx_record; idxrab.rab$w_usz = max_word + max_topic + max_count; idxrab.rab$b_mbf = 20; idxrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH; idxxab = cc$rms_xabkey; idxxab.xab$b_dtp = XAB$C_STG; idxxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR; idxxab.xab$w_pos0 = 0; idxxab.xab$b_siz0 = max_word + max_topic; idxxab.xab$b_ref = 0; if (!switch_present("link")) { if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL) lib$stop(status); if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL) lib$stop(status); } ptr = strrchr(out_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; if (sequential) strcat(out_name, ".seqsel"); else strcat(out_name, ".sel"); selfab = cc$rms_fab; selfab.fab$l_alq = 10; selfab.fab$b_bks = 3; selfab.fab$w_deq = 5; selfab.fab$b_fac = FAB$M_PUT; selfab.fab$l_fna = out_name; selfab.fab$b_fns = strlen(out_name); selfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; selfab.fab$w_mrs = max_topic + DESC_SIZE + SELECTOR_SIZE; if (sequential) selfab.fab$b_org = FAB$C_SEQ; else selfab.fab$b_org = FAB$C_IDX; selfab.fab$b_rat = FAB$M_CR; selfab.fab$b_rfm = FAB$C_VAR; selfab.fab$b_shr = FAB$M_NIL; selfab.fab$l_xab = (char *) &selxab; selrab = cc$rms_rab; selrab.rab$l_fab = (struct FAB *) &selfab; if (sequential) selrab.rab$b_rac = RAB$C_SEQ; else selrab.rab$b_rac = RAB$C_KEY; selrab.rab$l_rbf = sel_record; selrab.rab$b_mbf = 20; selrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH; selxab = cc$rms_xabkey; selxab.xab$b_dtp = XAB$C_STG; selxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR; selxab.xab$w_pos0 = 0; selxab.xab$b_siz0 = max_topic; selxab.xab$b_ref = 0; if (!switch_present("link")) { if (((status = sys$create(&selfab)) & 1) != SS$_NORMAL) lib$stop(status); if (((status = sys$connect(&selrab)) & 1) != SS$_NORMAL) lib$stop(status); if (switch_present("helpfile.selector")) { selector_spec = (char *) calloc((strlen(switch_value("helpfile.selector")) + 1), sizeof(char)); strcpy(selector_spec, switch_value("helpfile.selector")); if (switch_present("helpfile.title")) strcpy(desc, switch_value("helpfile.title")); else strcpy(desc, "Help on search commands"); strcpy(help_index, "?"); build_words(help_index, "", 0); /* add "?" to the index */ strcpy(help_index, "?help"); build_words(help_index, "", 0); /* add "?help" to the index */ write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); /* write helpfile */ } } else { /* /link */ ptr = strrchr(out_name, '.'); if (ptr) *ptr = '\0'; strcat(out_name, ".link"); lnk = fopen(out_name, "w", "mbc=50", "mbf=20"); if (!switch_present("link.sort")) fprintf(lnk, "Sortdir=False\n\n"); } for (;;) { /* process all files in input spec, first one already found */ if ((src = fopen(file_spec, "r", "mbc=50", "mbf=20")) == NULL) { printf("Can't read input file %s\n", file_spec); exit(3); } printf("Building index for %s\n", file_spec); start_pos = ftell(src); /* init start position */ strncpy(desc, "", DESC_SIZE + 1); while (fgets(src_line, sizeof(src_line), src)) { /* if the first character of the line is the hex value, end topic */ if ((src_line[0] == hex_value) && (type == hex)) { write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); continue; } ptr = strchr(src_line, '\n'); if (ptr) *ptr = '\0'; /* remove newline */ for (ptr = src_line; *ptr; ptr++) if (iscntrl(*ptr)) *ptr = ' '; /* convert tabs to spaces */ while ((strlen(src_line) > 0) && (src_line[strlen(src_line)-1] == ' ')) src_line[strlen(src_line)-1] = '\0';/* remove trailing blanks */ strcpy(orig_line, src_line); /* copy before forcing lower case */ for (ptr = src_line; *ptr; ptr++) *ptr = _tolower(*ptr); /* force lowercase */ strcpy(lc_line, src_line); /* copy with leading blanks */ for (ptr = src_line; *ptr; ptr++) if (*ptr > ' ') break; /* find first non-blank char */ strcpy(src_line, ptr); /* remove leading blanks */ /* break on dashes */ if (((type == equal) || (type == dash)) && (strncmp(orig_line, dashes, dash_len) == 0)) { write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); continue; } /* break on paragraph */ if ((type == para) && (strlen(src_line) == 0)) { write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); continue; } /* break on non-empty field */ if ((type == field) && !is_spaces(orig_line, field_pos, field_size)) { write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); start_pos = end_pos; /* don't skip over line with field break */ } /* save the first line by default */ if ((switch_present("default_topic")) && (strlen(desc) == 0)) strncpy(desc, orig_line, DESC_SIZE); if (switch_present("selector.text") && (where = find_str(lc_line, switch_value("selector.text")))) { selector_spec = (char *) my_realloc((char *) selector_spec, strlen(orig_line) + 1); strcpy(selector_spec, orig_line + where - 1 + strlen(switch_value("selector.text"))); if (switch_present("selector.end") && /* if selector=end given */ (where = find_str(selector_spec, switch_value("selector.end")))) selector_spec[where - 1] = '\0'; /* mark selector end */ while (*selector_spec == ' ') /* remove leading spaces */ strcpy(selector_spec, selector_spec + 1); continue; /* do not index this line */ } else if (selector_spec == NULL) selector_spec = (char *) calloc(1, sizeof(char)); /* make empty spec */ for (index = 0; topics[index].used; index++) { /* apply topic rules */ where = topics[index].pos; /* where text is found */ /* if text matches the source text and position */ if ( ((topics[index].pos > 0) && topics[index].text && strncmp(lc_line + topics[index].pos - 1, topics[index].text, strlen(topics[index].text)) == 0) || /* or position = 0 and text is found _somewhere_ */ ((topics[index].pos == 0) && (where = find_str(lc_line, topics[index].text))) || /* or no text given but position and size field is non-blank */ (!topics[index].text && !is_spaces(orig_line, topics[index].pos, topics[index].size)) ) { /* if topic matches and requested a break, do it */ if (topics[index].force_break) { write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); start_pos = end_pos; /* don't skip over topic line */ } if (topics[index].exclude) strcpy(orig_line + where - 1, orig_line + where - 1 + strlen(topics[index].text)); topics[index].found = (char *) my_realloc((char *) topics[index].found, (topics[index].size ? topics[index].size : strlen(orig_line)) + 1); if (topics[index].size > 0) { strncpy(topics[index].found, orig_line + where - 1 + topics[index].offset, topics[index].size); topics[index].found[topics[index].size] = '\0'; strncat(topics[index].found, spaces, topics[index].size - strlen(topics[index].found)); } else { strcpy(topics[index].found, orig_line + where - 1 + topics[index].offset); if (where = find_str(topics[index].found, topics[index].end)) topics[index].found[where - 1] = '\0'; /* terminate the found string */ } break; /* a line satisfies only one topic rule */ } } if (!switch_present("link")) test_words(src_line, punctuation, minimum_word, keywords); end_pos = ftell(src); /* end_pos points before any terminator */ if (type == line) write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); } /* in case file doesn't end with a terminator */ write_words(src, lnk, &selrab, &idxrab, &start_pos, &end_pos, desc, topics); fclose(src); status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); if ((status & 1) == 0) { lib$find_file_end(&context); break; } ptr = strchr(file_spec, ' '); if (ptr) *ptr = '\0'; /* chop off trailing spaces */ } if (switch_present("link")) fclose(lnk); else { sys$close(&selfab); sys$close(&idxfab); } } /* test for start/end of keyword indexing */ void test_words(char *line, char *punct, int minimum_word, topic_str *keywords) { char test_line[MAX_INPUT_LINE], copy_line[MAX_INPUT_LINE]; int ind, where, retry; if (!switch_present("keyword")) { /* no /keyword= */ build_words(line, punct, minimum_word); /* index everything */ return; } if ((keyword_index > -1) && ((keywords[keyword_index].end == NULL) || (strlen(keywords[keyword_index].end) == 0))) keyword_index = -1; /* keyword indexing stops at EOL unless /keyword=end */ strcpy(test_line, line); /* copy source line */ do { where = 0; retry = FALSE; if (keyword_index == -1) { /* between keywords */ for (ind = 0; keywords[ind].used; ind++) if (where = find_str(test_line, keywords[ind].text)) break; if (where) { keyword_index = ind; /* record current keyword */ strcpy(test_line, test_line + where - 1 + keywords[ind].offset); /* remove up to keyword */ if (keywords[ind].exclude) strcpy(test_line, test_line + strlen(keywords[ind].text)); } } if (keyword_index > -1) /* in keyword index */ if (where = find_str(test_line, keywords[keyword_index].end)) { strcpy(copy_line, test_line); test_line[where - 1] = '\0'; build_words(test_line, punct, minimum_word); /* index contents of line */ strcpy(test_line, copy_line + where - 1); /* restart at end word */ keyword_index = -1; /* no longer indexing */ retry = TRUE; /* check for another keyword */ } else { /* indexing and no end word found */ build_words(test_line, punct, minimum_word); /* index contents of line */ return; } } while (retry); } /* break line into words and save them in words[] */ void build_words(char *line, char *punct, int minimum_word) { char *cp, *cp2; for (cp = line; *cp; cp++) /* convert punctuation to spaces */ if (is_punct(*cp, punct)) *cp = ' '; strcat(line, " "); /* line ends with a space */ cp = line; while(cp2 = strchr(cp, ' ')) { /* break at space boundary */ *cp2 = '\0'; if (strlen(cp) > max_word) printf("Truncating %d character word (%s) to %d characters\n", strlen(cp), cp, max_word); if ((strlen(cp) > 0) && (((candidate_size == 0) && !is_noise(cp, noise_size, minimum_word)) || (candidate_size && is_candidate(cp, candidate_size)))) { if (words_index == words_size) /* table full */ expand_table(&words, &words_size); strncpy(words[words_index++], cp, max_word); } cp = cp2 + 1; } } /* expand *table[] by CHUNK elements of max_word characters */ void expand_table(char ***table, int *size) { int ind; *table = (char **) my_realloc((char **) *table, (*size + CHUNK) * sizeof(char *)); for (ind = 0; ind < CHUNK; ind++) (*table)[*size + ind] = (char *) calloc(max_word + 1, sizeof(char)); *size += CHUNK; } int power(int base, int exp) { int result; result = base; while (--exp) result *= base; return result; } /* qsort compare routine */ int compare (const void *str1, const void *str2) { return(strcmp(*(char **) str1, *(char **) str2)); } /* write out the index entries */ write_index(int db_index, struct RAB *idxptr) { int ind, status, dup_count; /* write out the words */ /* sort keys for counts and $put performance */ qsort(words, words_index, sizeof(char *), compare); dup_count = 1; strcpy(prev_keyword, words[0]); for (ind = 1; ind < words_index; ind++) if (strcmp(words[ind], prev_keyword) == 0) dup_count++; else { sprintf(idx_record, "%-*s%0*d%0*d", max_word, prev_keyword, max_topic, db_index, max_count, dup_count); status = sys$put(idxptr); if ((status & 1) != SS$_NORMAL) lib$stop(status); strcpy(prev_keyword, words[ind]); dup_count = 1; } /* write out the last word */ sprintf(idx_record, "%-*s%0*d%0*d", max_word, prev_keyword, max_topic, db_index, max_count, dup_count); status = sys$put(idxptr); if ((status & 1) != SS$_NORMAL) lib$stop(status); } void reset_topic(FILE *src, char *desc, int *start_pos, topic_str *topics) { int ind; strncpy(desc, "", sizeof(desc)); *start_pos = ftell(src); /* init start position */ for (ind = 0; ind < words_index; ind++) /* clear out words[] */ *words[ind] = '\0'; words_index = 0; for (ind = 0; ind < TOPIC_SIZE; ind++) if (topics[ind].found) *topics[ind].found = '\0'; } /* write out Gopher command, write out words */ void write_words(FILE *src, FILE *lnk, struct RAB *selptr, struct RAB *idxptr, int *start_pos, int *end_pos, char *desc, topic_str *topics) { static int db_index = 0; int ind, status; char filename[256], *ptr, temp_desc[512] = ""; char hostname[100], portname[10]; keyword_index = -1; /* stop indexing at end of section */ if (!switch_present("link")) { if (words_index == 0) { reset_topic(src, desc, start_pos, topics); return; /* no words to write */ } if ((db_index + 2) >= power(10, max_topic)) { printf("You have reached %d topics in this index\n", db_index); printf("Please re-index with /MAX_TOPIC larger than %d\n", max_topic); exit(5); } } fgetname(src, filename); if (!switch_present("version")) { /* if /noversion */ ptr = strchr(filename, ';'); /* get rid of version number */ if (ptr) *ptr = '\0'; } for(ptr = filename; *ptr; ptr++) *ptr = _tolower(*ptr); /* force filename lowercase */ for (ind = 0; ind < TOPIC_SIZE; ind++) if (topics[ind].found && (strlen(topics[ind].found) > 0)) { if (strlen(temp_desc) > 0) strcat(temp_desc, " "); strcat(temp_desc, topics[ind].found); } if (strlen(temp_desc) > 0) strncpy(desc, temp_desc, DESC_SIZE); if (strlen(desc) > 0) { /* no description, no index */ if (switch_present("selector.text") || strlen(selector_spec)) { if (!switch_present("selector.both") && /* selectors only */ (strlen(selector_spec) == 0)) { /* and no selector found */ reset_topic(src, desc, start_pos, topics); /* reset topic stuff */ return; /* and quit */ } if (switch_present("selector.ignore") && /* should we ignore this? */ (strchr(switch_value("selector.ignore"), *selector_spec))) { reset_topic(src, desc, start_pos, topics); return; /* yes, ignore this */ } strncpy(hostname, "", sizeof(hostname)); /* init hostname */ strncpy(portname, "", sizeof(portname)); /* init portname */ if (ptr = strchr(selector_spec, '|')) { *ptr = '\0'; /* mark off selector from host */ sprintf(hostname, "\t%s", ptr + 1); /* copy host/port */ if (ptr = strchr(hostname, '|')) { *ptr = '\0'; /* mark off port from host */ sprintf(portname, "\t%s", ptr + 1); } } if (switch_present("link")) { fprintf(lnk, "Name=%s\nType=%c\n", desc, *selector_spec); fprintf(lnk, "Path=%s\n", selector_spec+1); fprintf(lnk, "Port=%s\nHost=%s\n\n", strlen(hostname) ? hostname + 1 : "+", strlen(portname) ? portname + 1 : "+"); } else { sprintf(sel_record, "%0*d%c%s\t%s%s%s", max_topic, ++db_index, *selector_spec, desc, selector_spec+1, hostname, portname); selptr->rab$w_rsz = strlen(sel_record); if (((status = sys$put(selptr)) & 1) != SS$_NORMAL) lib$stop(status); write_index(db_index, idxptr); } strcpy(selector_spec, ""); /* reset the current selector */ } if (!switch_present("selector.text") || switch_present("selector.both")) { if (!switch_present("link")) { /* write out the selector */ if (switch_present("whole")) /* whole file is a special case */ sprintf(sel_record, "%0*d0%s\t0%s", max_topic, ++db_index, desc, filename); else sprintf(sel_record, "%0*d0%s\tR%d-%d-%s", max_topic, ++db_index, desc, *start_pos, *end_pos, filename); selptr->rab$w_rsz = strlen(sel_record); if (((status = sys$put(selptr)) & 1) != SS$_NORMAL) lib$stop(status); write_index(db_index, idxptr); } else { fprintf(lnk, "Name=%s\nType=0\n", desc); fprintf(lnk, "Path=R%d-%d-%s\n", *start_pos, *end_pos, filename); fprintf(lnk, "Port=+\nHost=+\n\n"); } } printf("%s\n", desc); } reset_topic(src, desc, start_pos, topics); /* clear words[], topics */ } /* read in a file of words */ void load_words(char *name, char *punct, char ***table, int *table_size, int *table_index) { FILE *nf; char *cp, *cp2, line[MAX_INPUT_LINE]; static char file_name[256]; short leng; int status; $DESCRIPTOR(name_dsc, file_name); if (switch_present(name)) { strcpy(file_name, switch_value(name)); if ((nf = fopen(file_name, "r")) == NULL) { printf("Can't read data file %s\n", file_name); return; } } else if (strcmp(name, "noise")) return; else if ((nf = fopen("_noise_words", "r", "dna = gopher_root:[000000].dat")) == NULL) return; while (fgets(line, sizeof(line), nf)) { cp = strchr(line, '\n'); if (cp) *cp = '\0'; /* remove newline */ for (cp = line; *cp; cp++) { if (is_punct(*cp, punct) || iscntrl(*cp)) *cp = ' '; /* convert punctuation, tabs to spaces */ *cp = _tolower(*cp); /* force lowercase */ } while ((strlen(line) > 0) && (line[strlen(line)-1] == ' ')) line[strlen(line)-1] = '\0'; /* remove trailing blanks */ for (cp = line; *cp; cp++) if (*cp > ' ') break; /* find first non-blank char */ strcpy(line, cp); /* remove leading blanks */ strcat(line, " "); /* line ends with a space */ cp = line; while(cp2 = strchr(cp, ' ')) { /* break at space boundary */ *cp2 = '\0'; if (strlen(cp) > 0) { if (*table_index == *table_size) /* table full */ expand_table(table, table_size); strcpy((*table)[(*table_index)++], cp); } cp = cp2 + 1; } } fclose(nf); } /* see if a char is punctuation */ int is_punct(char ch, char *punct) { char *ptr; for (ptr = punct; *ptr; ptr++) if (*ptr == ch) return TRUE; return FALSE; } /* see if field is spaces */ int is_spaces(char *line, int pos, int size) { int index; if (strlen(line) < pos) return(TRUE); for (index = 0; index < size; index++) if (!isspace(line[pos + index - 1])) return(FALSE); return(TRUE); } /* see if the word is noise */ int is_noise(char *word, int size, int minimum_word) { int ind; if (strlen(word) < minimum_word) /* simple heuristic saves lots of noise entries */ return(TRUE); if ((!switch_present("numbers")) && isdigit(*word)) return(TRUE); for(ind = 0; ind < size; ind++) { if (noise[ind] == NULL) return (FALSE); if (strcmp(noise[ind], word) == 0) return (TRUE); } return (FALSE); } /* see if the word is candidate */ int is_candidate(char *word, int size) { int ind; for(ind = 0; ind < size; ind++) { if (candidate[ind] == NULL) return (FALSE); if (strcmp(candidate[ind], word) == 0) return (TRUE); } return (FALSE); } char *lc(char *str) { # define N_STRING 4 static char strings[N_STRING][MAX_INPUT_LINE]; static int cur_string = -1; char *cp; if (++cur_string >= N_STRING) cur_string = 0; for (cp = strings[cur_string];; cp++) { *cp = tolower(*str++); if (*cp == '\0') break; } return strings[cur_string]; } /* find where the string starts (origin 1) in record */ int find_str(char *record, char *str) { char *cp, *lcr; if ((str == NULL) || (strlen(str) == 0)) return (0); /* zero means string not found */ lcr = lc(record); cp = strstr(lcr, lc(str)); if (cp == NULL) return (0); return (cp - lcr + 1); } /* descr() creates character descriptor and return the address of the descriptor to the caller. */ # define N_DESCR 10 static struct dsc$descriptor_s str_desc[N_DESCR]; static int cur_descr = -1; struct dsc$descriptor_s *descr(char *string) { if(++cur_descr >= N_DESCR) cur_descr = 0; str_desc[cur_descr].dsc$w_length=(short)strlen(string); str_desc[cur_descr].dsc$b_dtype=DSC$K_DTYPE_T; str_desc[cur_descr].dsc$b_class=DSC$K_CLASS_S; str_desc[cur_descr].dsc$a_pointer=string; return (&str_desc[cur_descr]); } int get_decimal(char *ptr) { do ptr++; while ((*ptr != '=') /* skip to the keyword/parameter */ && (*ptr != ':')); /* separator character */ while (isspace(*++ptr)); /* skip spaces */ return ((atoi(ptr) < 256) ? atoi(ptr) : 256); } char *get_text(char **dest, char *ptr) { char *start, *cp; do ptr++; while ((*ptr != '=') /* skip to the keyword/parameter */ && (*ptr != ':')); /* separator character */ while (isspace(*++ptr)); /* skip spaces */ if (*ptr == '"') { /* if quoted string */ start = ++ptr; /* skip over quote */ for (; *ptr; ptr++) { /* skip to ending quote */ if ((*ptr == '"') && (*(ptr+1) == '"')) { /* doubled quotes? */ ptr++; /* yes, skip it */ continue; } if (*ptr == '"') /* un-doubled quote? */ break; /* yes, stop here */ } } else { /* else non-quoted string */ start = ptr; /* start of string */ while (*ptr && (*ptr != ' ') && (*ptr != ',') && (*ptr != '/') && (*ptr != ')')) ptr++; /* skip to string terminator */ } *dest = (char *) calloc((ptr - start) + 1, sizeof(char)); strncpy(*dest, start, ptr - start); for (cp = *dest; *cp; *cp++) /* collapse doubled quotes to single */ if ((*cp == '"') && (*(cp+1) == '"')) strcpy(cp, cp+1); return (ptr); } /* parse command line for /topic */ void parse_topic(char *line, topic_str *topics) { char *ptr, *start, **dest; static int index = -1; for (ptr = line; *ptr; ptr++) *ptr = _tolower(*ptr); /* force command line lowercase */ ptr = line; /* point to start of line */ for (;;) { /* search for /topic until end of line */ if (index == TOPIC_SIZE) return; /* exit if we can't hold any more */ ptr = strchr(ptr, '/'); /* search for switch start */ if (ptr == NULL) return; /* no more switches */ while (isspace(*++ptr)); /* skip spaces */ if (*ptr != 't') /* topic is unique to one character */ continue; /* not /topic, keep scanning */ do ptr++; while ((*ptr != '=') /* skip to the keyword/parameter */ && (*ptr != ':')); /* separator character */ while (isspace(*++ptr)); /* skip spaces */ if (*ptr == '(') /* if start of list */ while (isspace(*++ptr)); /* skip spaces */ index++; /* next topics structure */ topics[index].used = TRUE; /* this topic index is used */ topics[index].pos = DEFAULT_POS; /* default position */ topics[index].end = ""); parse_switch("selector.both", sw, 0, 0); parse_switch("selector.ignore", sw, 0, 0); parse_switch("sequential", sw, 0, 0); parse_switch("version", sw, 1, 0); parse_switch("whole", sw, 0, 0); parse_switch("word_length", sw, 1, "20"); if (switch_present("keyword")) { keyword_index = -1; parse_keyword(input->dsc$a_pointer, keywords); /* fill keywords[] */ } parse_topic(input->dsc$a_pointer, topics); /* parse the command line and fill topics */ } int switch_present(char *name) { int ind; for (ind = 0; switches[ind].name; ind++) if (strcmp(name, switches[ind].name) == 0) break; if (switches[ind].name == NULL) return (FALSE); return (switches[ind].state); } char *switch_value(char *name) { int ind; for (ind = 0; switches[ind].name; ind++) if (strcmp(name, switches[ind].name) == 0) break; return (switches[ind].value); } void *my_realloc(void *mem, int size) { if (mem == (void *) 0) return ((void *) malloc(size)); else return((void *) realloc(mem, size)); } .