/* index qi data file */ /* Bruce Tanner - Cerritos College */ /* 1.0 1993/08/14 Start with build_index */ /* 1.1 1993/08/30 Make fopen failure more explicit */ /* 1.2 1993/09/04 Move soundex creation outside */ #include ssdef #include stdio #include string #include ctype #include rms #include descrip #include climsgdef #include assert #include "qi.h" char idx_record[IDX_RECORD_SIZE + 1]; char idx_key[IDX_KEY_SIZE + 1]; char dat_record[DAT_RECORD_SIZE + 1]; char dat_key[DAT_KEY_SIZE + 1]; int field_attrib[MAX_FIELD]; int mode = 0; #define CREATE 1 #define MERGE 2 void read_fields(char *); void index_words(char *, struct RAB *, struct RAB *); struct dsc$descriptor_s *descr(char *); void build_commands(); int lib$get_foreign(), lib$get_input(); main(int argc, char *argv[]) { FILE *src; char cli_input[256], file_arg[256], file_spec[256]; char idx_name[256], dat_name[256]; char *ptr, field[DATA_SIZE + 1]; char dat_copy[DAT_RECORD_SIZE + 1]; int status, context = 0, count = 0; short leng; struct FAB idxfab, datfab; struct RAB idxrab, datrab; struct XABKEY idxxab, datxab; $DESCRIPTOR(input_dsc, cli_input); $DESCRIPTOR(file_dsc, file_arg); $DESCRIPTOR(file_spec_dsc, file_spec); $DESCRIPTOR(idx_dsc, idx_name); status = lib$get_foreign(&input_dsc, 0, &leng, 0); strncpy(cli_input+6, cli_input, leng); strncpy(cli_input, "build ", 6); status = cli$dcl_parse(&input_dsc, build_commands, lib$get_input); if (status != CLI$_NORMAL) /* error in parse, exit */ exit(1); if ((cli$present(descr("file")) & 1) == 0) { printf("Usage: build data_file /data/create/merge/config=.../output=...\n"); exit(3); } status = cli$get_value(descr("file"), &file_dsc, &leng); /* get source */ status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); ptr = strchr(file_spec, ' '); if (ptr) *ptr = '\0'; /* chop off trailing spaces */ strcpy(idx_name, file_spec); /* make copy for output spec */ if (cli$present(descr("output")) & 1) { /* if /output, overwrite out_name */ status = cli$get_value(descr("output"), &idx_dsc, &leng); idx_name[leng] = '\0'; } if (cli$present(descr("create")) & 1) mode = CREATE; if (cli$present(descr("merge")) & 1) mode = MERGE; ptr = strrchr(idx_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; strcat(idx_name, ".INDEX"); idxfab = cc$rms_fab; idxfab.fab$b_bks = 6; idxfab.fab$b_fac = FAB$M_GET | FAB$M_PUT; idxfab.fab$l_fna = idx_name; idxfab.fab$b_fns = strlen(idx_name); idxfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; idxfab.fab$w_mrs = IDX_RECORD_SIZE; idxfab.fab$b_org = FAB$C_IDX; idxfab.fab$b_rat = FAB$M_CR; idxfab.fab$b_rfm = FAB$C_FIX; idxfab.fab$b_shr = FAB$M_NIL; idxfab.fab$l_xab = &idxxab; idxrab = cc$rms_rab; idxrab.rab$l_fab = &idxfab; idxrab.rab$b_krf = 0; idxrab.rab$l_kbf = idx_key; idxrab.rab$b_ksz = IDX_KEY_SIZE; idxrab.rab$b_rac = RAB$C_KEY; idxrab.rab$l_rbf = idx_record; idxrab.rab$w_rsz = IDX_RECORD_SIZE; idxrab.rab$l_ubf = idx_record; idxrab.rab$w_usz = IDX_RECORD_SIZE; idxrab.rab$b_mbf = 20; idxrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH; idxxab = cc$rms_xabkey; idxxab.xab$b_dtp = XAB$C_STG; idxxab.xab$b_flg = XAB$M_IDX_NCMPR; idxxab.xab$w_pos0 = 0; idxxab.xab$b_siz0 = IDX_KEY_SIZE; idxxab.xab$b_ref = 0; strcpy(dat_name, idx_name); ptr = strrchr(dat_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; strcat(dat_name, ".DATA"); datfab = cc$rms_fab; datfab.fab$b_bks = 9; datfab.fab$b_fac = FAB$M_GET | FAB$M_PUT | FAB$M_UPD; datfab.fab$l_fna = dat_name; datfab.fab$b_fns = strlen(dat_name); datfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; datfab.fab$w_mrs = DAT_RECORD_SIZE; datfab.fab$b_org = FAB$C_IDX; datfab.fab$b_rat = FAB$M_CR; datfab.fab$b_rfm = FAB$C_VAR; datfab.fab$b_shr = FAB$M_NIL; datfab.fab$l_xab = &datxab; datrab = cc$rms_rab; datrab.rab$l_fab = &datfab; datrab.rab$b_krf = 0; datrab.rab$l_kbf = dat_key; datrab.rab$b_ksz = DAT_KEY_SIZE; datrab.rab$b_rac = RAB$C_KEY; datrab.rab$l_rbf = dat_record; datrab.rab$b_mbf = 20; datrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH; datxab = cc$rms_xabkey; datxab.xab$b_dtp = XAB$C_STG; datxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR; datxab.xab$w_pos0 = 0; datxab.xab$b_siz0 = DAT_KEY_SIZE; datxab.xab$b_ref = 0; /* open index file */ if (mode == CREATE) if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL) lib$stop(status); if (mode == MERGE) if (((status = sys$open(&idxfab)) & 1) != SS$_NORMAL) lib$stop(status); if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL) lib$stop(status); /* open data file */ if (cli$present(descr("DATA")) & 1) { if (mode == CREATE) if (((status = sys$create(&datfab)) & 1) != SS$_NORMAL) lib$stop(status); if (mode == MERGE) if (((status = sys$open(&datfab)) & 1) != SS$_NORMAL) lib$stop(status); if (((status = sys$connect(&datrab)) & 1) != SS$_NORMAL) lib$stop(status); } /* record the fields with Indexed attribute */ read_fields(file_spec); for (;;) { /* process all files in input spec, first one already found */ if ((src = fopen(file_spec, "r", "mbc=50", "mbf=20")) == NULL) { printf("Can't read input file %s\n", file_spec); exit(5); } printf("Building index for %s\n", file_spec); while (fgets(dat_record, sizeof(dat_record), src)) { if ((ptr = strchr(dat_record, '\r')) || (ptr = strchr(dat_record, '\n'))) *ptr = '\0'; /* remove newline */ if (strlen(dat_record) == 0) continue; /* skip blank lines */ if ((++count % 500) == 0) printf("%d\n", count); /* if /DATA requested, write .data file record */ if (cli$present(descr("data")) & 1) { strncpy(dat_key, dat_record, DAT_KEY_SIZE); datrab.rab$w_rsz = strlen(dat_record); if ((status = sys$put(&datrab)) != RMS$_NORMAL) { if ((status == RMS$_DUP) && (mode == MERGE)) { status = sys$find(&datrab); status = sys$update(&datrab); /* update the record */ } if (status != RMS$_NORMAL) { printf("DATA key (%d chars) %s\n", strlen(dat_key), dat_key); printf("DATA rec (%d chars) %s\n", strlen(dat_record), dat_record); lib$stop(status); } } } strcpy(dat_copy, dat_record); /* if this is an indexed field, write index record(s) */ strncpy(field, dat_copy + ID_SIZE, FIELD_SIZE); field[FIELD_SIZE] = '\0'; if (field_attrib[atoi(field)] & ATTR_INDEXED) { for (ptr = dat_copy; *ptr; ptr++) if (iscntrl(*ptr)) *ptr = ' '; /* convert tabs to spaces */ while ((strlen(dat_copy) > 0) && (dat_copy[strlen(dat_copy)-1] == ' ')) dat_copy[strlen(dat_copy)-1] = '\0';/* remove trailing blanks */ for (ptr = dat_copy; *ptr; ptr++) *ptr = _tolower(*ptr); /* force lowercase */ index_words(dat_copy, &idxrab, &datrab); } } fclose(src); status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); if ((status & 1) == 0) { lib$find_file_end(&context); break; } ptr = strchr(file_spec, ' '); if (ptr) *ptr = '\0'; /* chop off trailing spaces */ } if (cli$present(descr("data")) & 1) sys$close(&datfab); sys$close(&idxfab); } /* break data field into words and write them to index file */ void index_words(char *line, struct RAB *idxptr, struct RAB *datptr) { char data[DATA_SIZE + 2], field[FIELD_SIZE + 1], id[ID_SIZE + 1]; char *cp, *cp2; int status; strncpy(id, line, ID_SIZE); id[ID_SIZE] = '\0'; strncpy(field, line + ID_SIZE, FIELD_SIZE); field[FIELD_SIZE] = '\0'; strncpy(data, line + ID_SIZE + FIELD_SIZE + SEQ_SIZE + ATTR_SIZE, DATA_SIZE); data[DATA_SIZE] = '\0'; /* special hack to omit indexing the email domain */ if ((strcmp(field, EMAIL_FIELD) == 0) && (cp = strchr(data, '@'))) *cp = '\0'; #if NAME_HACK if (strcmp(field, NAME_FIELD) == 0) /* only edit name field */ for (cp = data; *cp; cp++) { /* apply any special editing to names */ if (*cp == '-') *cp = ' '; /* index both hyphenated names */ if (*cp == '\'') strcpy(cp, cp+1); /* squeeze out apostrophe */ } #endif strcat(data, " "); /* line ends with a space */ cp = data; while(cp2 = strchr(cp, ' ')) { /* break at space boundary */ *cp2 = '\0'; if (strlen(cp) > KEYWORD_SIZE) printf("Truncating %d character word /%s/ to %d characters\n", strlen(cp), cp, KEYWORD_SIZE); if (strlen(cp) >= MIN_KEYWORD) { sprintf(idx_record, "%-*.*s%s%s", KEYWORD_SIZE, KEYWORD_SIZE, cp, field, id); strncpy(idx_key, idx_record, IDX_KEY_SIZE); idx_key[IDX_KEY_SIZE] = '\0'; if ((field_attrib[atoi(field)] & ATTR_UNIQUE) && ((status = sys$get(idxptr)) & 1)) /* unique record found? */ printf("Omit duplicate unique record: %s\n", line); else { idxptr->rab$w_rsz = IDX_RECORD_SIZE; if (((status = sys$put(idxptr)) & 1) == 0) if (status != RMS$_DUP) lib$stop(status); } } cp = cp2 + 1; } } char * get_field(char *ptr, char *field) { int ind; for (ind= 0; *ptr != '\0' && *ptr != ':'; ptr++, ind++) field[ind] = _tolower(*ptr); field[ind] = '\0'; if (*ptr == ':') ptr++; /* skip over terminating ":" */ return ptr; } void read_fields(char *file) { FILE *cnf; char *ptr, config[256], line[256], field[128]; int ind, field_num; short leng; $DESCRIPTOR(config_dsc, config); if (cli$present(descr("configuration")) & 1) { /* if /config */ cli$get_value(descr("configuration"), &config_dsc, &leng); config[leng] = '\0'; } else { /* no /config switch */ strcpy(config, file); ptr = strrchr(config, '.'); if (ptr) *ptr = '\0'; strcat(config,".cnf"); } for (ind = 0; ind < MAX_FIELD; ind++) field_attrib[ind] = 0; /* init array */ if ((cnf = fopen(config, "r", "dna=.cnf")) == NULL) { printf("Can't read config file %s\n", config); exit(7); } while (fgets(line, sizeof(line), cnf)) { ptr = strchr(line, '\n'); if (ptr) *ptr = '\0'; /* remove newline */ ptr = line; if ((*ptr == '#') || (*ptr == '\0')) /* comment or blank? */ continue; /* yes, skip line */ ptr = get_field(ptr, field); /* field number */ field_num = atoi(field); ptr = get_field(ptr, field); /* field name */ ptr = get_field(ptr, field); /* field size */ ptr = get_field(ptr, field); /* field description */ ptr = get_field(ptr, field); /* field option */ for (;;) { ptr = get_field(ptr, field); /* get attribute */ if (strlen(field) == 0) break; /* no more attributes */ /* attributes are unique to one letter */ for (ind = 0; ind < MAX_ATTRIBUTES; ind++) if (field[0] == _tolower(attributes[ind].name[0])) field_attrib[field_num] |= attributes[ind].value; } } fclose(cnf); } /* descr() creates character descriptor and returns * the address of the descriptor to the caller. */ # define N_DESCR 10 static struct dsc$descriptor_s str_desc[N_DESCR]; static int cur_descr = -1; struct dsc$descriptor_s *descr(char *string) { if(++cur_descr >= N_DESCR) cur_descr = 0; str_desc[cur_descr].dsc$w_length=(short)strlen(string); str_desc[cur_descr].dsc$b_dtype=DSC$K_DTYPE_T; str_desc[cur_descr].dsc$b_class=DSC$K_CLASS_S; str_desc[cur_descr].dsc$a_pointer=string; return (&str_desc[cur_descr]); } .