
/*
last update KUB 1 october 1996
*/

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include "utils.h"

extern void help(void);

/* ================================================================ */
void initialize(char *s)
{
    /*
     * Reset every run-time option and counter to its start-up value.
     * s is the name of the running program; it is copied into
     * executing_program so that error messages can mention it.
     */

    /* program identification */
    strcpy(executing_program, s);
    strcpy(version, "3.0");

    /* input handling */
    bezig               = 1;        /* check for end of inputfile */
    comment             = COMMENT;
    complete_lines      = 1;
    number_of_tokens    = 0;
    number_of_sentences = 1;
    regel_nummer        = 0;

    /* codes at the start of a line */
    add_codes           = 0;        /* leave codes in output */
    ignore_codes        = 0;
    no_codes            = 0;

    /* record separators */
    recsep[0]           = 0;
    recstr[0]           = 0;        /* keeper for record-separator */
    record_number       = 0;
    print_rec           = 0;
    write_rec           = 0;

    /* word selection */
    check_stopwoorden   = 0;        /* check if in list of stopwords */
    do_sig_check        = 0;
    sig_check           = 0;
    stop_tags           = 0;
    min_length          = 2;
    word_control        = 0;
    only_weighted_words = -1;
    weight_filetype     = -1;

    /* processing mode */
    method              = 0;
    n_grams             = 0;
    sentences           = 0;

    /* diagnostics */
    debug               = 0;
    verbose             = 0;
}
/* ================================================================ */
void check_all(void)
{
    /* Dump the current value of the run-time options to stdout. */
    printf("executing program  %s\n", executing_program);
    printf("============================-\n");

    printf("add_codes           %d\n", add_codes);
    printf("bezig               %d\n", bezig);
    printf("check_stopwoorden   %d\n", check_stopwoorden);
    printf("complete_lines      %d\n", complete_lines);
    printf("debug               %d\n", debug);
    printf("do_sig_check        %d\n", do_sig_check);
    printf("ignore_codes        %d\n", ignore_codes);
    printf("no_codes            %d\n", no_codes);
    printf("method              %d\n", method);
    printf("min_length          %d\n", min_length);
    printf("n_grams             %d\n", n_grams);
    printf("number_of_tokens    %d\n", number_of_tokens);
    printf("number_of_sentences %d\n", number_of_sentences);
    printf("only_weighted_words %d\n", only_weighted_words);
    printf("print_rec           %d\n", print_rec);
    printf("write_rec           %d\n", write_rec);
    printf("recsep              %s\n", recsep);

    printf("regel_nummer        %d\n", regel_nummer);
    printf("sentences           %d\n", sentences);
    printf("sig_check           %d\n", sig_check);
    printf("stop_tags           %d\n", stop_tags);
    printf("verbose             %d\n", verbose);
    printf("word_control        %d\n", word_control);
    printf("weight_filetype     %d\n", weight_filetype);

    printf("============================-\n");
}
/* ================================================================ */
/* utils.c
int check_codes(FILE *f)

Reads the first line of f and looks for a ']' in it.
Returns 1 if that line contains NO ']' (also when nothing could be
read, e.g. an empty file), and 0 if it does contain one.
Leaves the filepointer at begin of file.

*/


int check_codes(FILE *f)
{
    char regel[REGEL_LENGTE];

    /* On an empty or unreadable file fgets leaves the buffer untouched;
       treat that as an empty line instead of scanning uninitialised memory. */
    if (fgets(regel, sizeof regel, f) == NULL) regel[0] = 0;

    fseek(f, 0, SEEK_SET);

    return (strchr(regel, ']') == NULL) ? 1 : 0;
}
/* ================================================================ */
/* utils.c
void error_exit(char *s,char *ss)

Writes "<program>: Error - <s><ss>" followed by an empty line to stderr
and terminates the program with exit status 1.

*/

void error_exit(char *s,char *ss)
{
    fprintf(stderr,
            "%s: Error - %s%s\n\n",
            executing_program, s, ss);

    exit(1);
}
/* ================================================================ */
/* utils.c
int scanfile(int filetype,FILE *f,char *file,char *woord,float *gewicht,int num)

Reads one line from a weights-file and parses it according to filetype:

   0      woord  gewicht [file]
   1      gewicht woord  [file]
   2, 4   file   gewicht woord
   3, 5   file   woord   gewicht

Returns 0 if no line could be read (end of file or read error), if the
line starts with the comment character, if it is shorter than 3
characters, or if filetype is not one of 0..5.
Otherwise returns the result of sscanf (number of fields converted).
woord, gewicht and file receive the fields that were converted; fields
that were not converted are left untouched.
num is only used in the debug output.

The caller's woord and file buffers must be able to hold any token of
a line (lines are read in pieces of at most 1023 characters).

*/

int scanfile(int filetype,FILE *f,char *file,char *woord,float *gewicht,int num)
{
    char regel[1024];

    /* Without this check a failed read would re-parse whatever happened
       to be in the buffer, e.g. the previous line once more at EOF. */
    if (fgets(regel, sizeof regel, f) == NULL) return 0;

    if (debug) printf("num %4d, gelezen uit index-->>%s\n",num,regel);
    if (regel[0]==comment) return 0;
    if (strlen(regel)<3) return 0;

    switch (filetype)
      {
      case 0:
          return sscanf(regel,"%s %f %s\n",woord,gewicht,file);
      case 1:
          /* third field is optional text; it used to be scanned with %f
             into the char buffer, which is a type mismatch */
          return sscanf(regel,"%f %s %s\n",gewicht,woord,file);
      case 2:
      case 4:
          return sscanf(regel,"%s %f %s\n",file,gewicht,woord);
      case 3:
      case 5:
          return sscanf(regel,"%s %s %f\n",file,woord,gewicht);
      default:
          /* unknown filetype: nothing parsed */
          return 0;
      }
}

/* ================================================================ */
/* utils.c

int check_file_type(char *indexnaam)

Determines the filetype of the index-file with name 'indexnaam' from
its first line that is neither a comment nor shorter than 3 characters.
Exits through error_exit if the file cannot be opened.
Closes the file, stores the result in weight_filetype and returns it.
If the file holds no usable line, weight_filetype is left unchanged.

?????    ??????          --> filetype 0     string   float
float    ??????          --> filetype 1     float    string

?????    ??????  ??????  --> filetype 2     num      float   string
?????    ??????  float   --> filetype 3     num      string  float

string   ??????  ??????  --> filetype 4     string   float   string
string   ??????  float   --> filetype 5     string   string  float

*/

int check_file_type(char *indexnaam)
{
    char regel[100];
    /* a token can never be longer than the line it was read from */
    char r1[sizeof regel], r2[sizeof regel], r3[sizeof regel];
    FILE *file;
    int r = 0;

    if ((file=fopen(indexnaam,"r"))==NULL) error_exit("No index-file: ",indexnaam);

    r1[0] = r2[0] = r3[0] = 0;
    regel[0] = 0;

    for (;;)
      {
      /* stop at end of file; the old do/while looped forever on a file
         without any data line */
      if (fgets(regel, sizeof regel, file) == NULL)
          {
          regel[0] = 0;
          break;
          }
      if ((regel[0]==comment) || (strlen(regel)<3)) continue;

      r = sscanf(regel,"%s %s %s\n",r1,r2,r3);
      break;
      }
    fclose(file);

    if (r==2) weight_filetype=0;                          /* discrim-index          */
    if (atof(r1)) weight_filetype=1;                      /* first field is a float */

    if (r==3) weight_filetype=2;                          /* so it is a SMART-index */
    if ((r==3) && (atof(r3))) weight_filetype=3;          /* last field is a float  */
    if ((r==3) && (isalpha((unsigned char)r1[0])))
        weight_filetype+=2;                               /* first is a filename    */

    if (debug) printf("-->>>r %d; r1 %s; r2 %s; \nregel %s; filetype %d\n",
                                        r,r1,r2,regel,weight_filetype);

    return weight_filetype;
}

/* ================================================================ */

/* utils.c
int haal_gewichten(char *indexnaam,char *tekstnaam)


Builds the dictionary with weights in serie[] and returns the number of
words stored. serie[<returned number>].w is set to 0 as end marker.
Lines starting with the comment character and lines shorter than 3
characters are ignored.

For weight_filetype < 2 every parsed line is stored.
For weight_filetype >= 2 (three-column weightfile) only the lines whose
file/record column equals tekstnaam (with any leading path removed) are
stored. A NULL tekstnaam is treated as an empty name.

Exits through error_exit if the index-file cannot be opened.

NOTE(review): the first data line of the index-file is read and thrown
away before the weights are collected. This is kept exactly as it was;
confirm whether that line is meant to be a header or whether the first
weight is lost by accident.

NOTE(review): nothing limits the number of entries written to serie[];
its size is not visible here.

calls:
   scanfile

*/

int haal_gewichten(char *indexnaam,char *tekstnaam)

{
    /* scanfile reads lines in pieces of at most 1023 characters, so no
       token it hands back can be longer than these buffers */
    char r1[1024], woord[1024], regel[1024];
    char *tekstfilenaam, *slash;
    float fl = .0;
    FILE *file;
    int c,ok,num,t;

    if (debug) printf("HAAL_GEWICHTEN\n");

    num=0;
    r1[0]=woord[0]=0;

    /* ========-- check that the filename has no path left ============--- */

    tekstfilenaam = tekstnaam ? tekstnaam : "";
    slash = strrchr(tekstfilenaam,'/');
    if (slash) tekstfilenaam = slash+1;

    if (debug) printf("tekstfilenaam %s\n",tekstfilenaam);
    if ((file=fopen(indexnaam,"r"))==NULL) error_exit("No index-file ",indexnaam);

    /* consume everything up to and including the first data line;
       give up cleanly when the file has none (this used to loop forever) */
    do
      {
      if (fgets(regel, sizeof regel, file) == NULL)
          {
          fclose(file);
          serie[0].w=.0;
          return 0;
          }
      }
    while ((regel[0]==comment) || (strlen(regel)<3));

    /* ========-- but if it is a single file with records ================ */

    t=0;
    for (;;)
        {
        /* look one character ahead: feof() only turns true after a read
           has failed, which made the last line count twice */
        c=fgetc(file);
        if (c==EOF) break;
        ungetc(c,file);

        ok=scanfile(weight_filetype,file,r1,woord,&fl,num);

        /* two-column files have no file/record column, so test the
           filetype first and compare names only when there is one */
        if ((ok>0) && ((weight_filetype<2) || (!strcmp(r1,tekstfilenaam))))
             {
             if ((debug) && (t<10))
                   printf("res: %d num: %d wrd: %s wgt: %f, file %s\n",
                                                                 ok, num,woord,fl,r1);
             serie[num].w=fl;
             strncpy(serie[num].concept,woord,WORD_LENGTH);
             num++;
             if ((debug) && (weight_filetype>1))
                   printf("from rec %s in index %s\n",r1,woord);
             }
        t++;
        }

    fclose(file);
    serie[num].w=.0;

    return(num);

}
/* ================================================================- */
/* utils.c
int lees_woordenlijst(char *naam,char *lijst[])

Reads the whitespace-separated words of file 'naam' into list 'lijst'.
Each word is stored in freshly malloc'ed memory; the caller owns it.
A word starting with the comment character makes the rest of its line
a comment, which is skipped.
Words longer than 49 characters are split into pieces of 49.
Returns the number of words stored.
Exits through error_exit if the file cannot be opened or memory runs out.

NOTE(review): the capacity of lijst is not known here, so the number of
words is not limited; the caller must provide a large enough array.

*/

int lees_woordenlijst(char *naam,char *lijst[])
{
    int x = 0;
    FILE *f;
    char s[50];

    if ((f=fopen(naam,"r"))==NULL) error_exit("not found: ",naam);

    /* loop on a successful read instead of on feof(), so that an empty
       file never leaves s uninitialised; the width keeps s from overflowing */
    while (fscanf(f,"%49s",s)==1)
      {
      if (s[0]==comment)
         {
         /* discard the remainder of the comment line, if any */
         fscanf(f,"%*[^\n]");
         continue;
         }

      lijst[x]=malloc(strlen(s)+1);
      if (lijst[x]==NULL) error_exit("out of memory while reading ",naam);
      strcpy(lijst[x],s);
      if ((debug) && (x<10)) printf("%s\n",lijst[x]);
      x++;
      }

    fclose(f);
    return(x);
}
/* ================================================================- */
/* utils.c
char is_in_woordenlijst(char *s,int aantal,char *lijst[],int tags)

Checks whether string s is one of the first 'aantal' entries of lijst.
If tags is non-zero, everything up to and including the first '_' in s
is skipped before comparing; a word without '_' is compared as it is.
Returns 1 if found else 0.

*/

char is_in_woordenlijst(char *s,int aantal,char *lijst[],int tags)
{
    const char *key = s;
    int x;

    if (tags)
      {
      /* compare in place: no copy, so no fixed-size buffer to overflow
         and nothing left uninitialised when there is no '_' */
      const char *p = strchr(s,'_');
      if (p) key = p+1;
      }

    for (x=0;x<aantal;x++)
      {
      if (strcmp(key,lijst[x])==0) return(1);
      }
    return(0);
}
/* ================================================================- */
/* utils.c
void clean_word(char *woord,char *s2)

Strips, in place, all characters that are not letters or digits from
the front and from the back of string woord. Characters in the middle
are left alone. A word without any letter or digit becomes empty.
s2 is not used; it is kept for existing callers.
*/
void clean_word(char *woord,char *s2)
{
    size_t begin = 0;
    size_t end = strlen(woord);

    (void)s2;

    while ((end>0) && (!isalnum((unsigned char)woord[end-1]))) end--;
    while ((begin<end) && (!isalnum((unsigned char)woord[begin]))) begin++;

    /* source and destination overlap, which strcpy does not allow */
    memmove(woord,woord+begin,end-begin);
    woord[end-begin]=0;
}

/* ================================================================ */
/* Converts string woord to lower case in place and returns woord. */
char *lowercase(char *woord)
{
    char *p;

    for (p=woord; *p; p++)
        *p=(char)tolower((unsigned char)*p);

    /* the function is declared char *, so hand back the string */
    return woord;
}


/* ===========check for the word-category of the token, if any ========= */

/* utils.c
int check_category(char *token)

Tells whether token is accepted under the current value of
only_weighted_words, judged by the first two characters of token:

   <= 0 : every token is accepted
   1    : accepted if token starts with "nn"
   2    : accepted if token starts with "vb"
   3    : accepted if token starts with "jj"
   13   : accepted if token starts with "nn", "vb" or "jj"
   else : nothing is accepted

Returns 1 if accepted, 0 if not.

NOTE(review): the earlier description of mode 13 said "nouns and adjs"
only, but "vb" is accepted as well; confirm which one is intended.

*/

int check_category(char *token)
{
    int noun, verb, adj;

    if (only_weighted_words <= 0)
        return 1;

    noun = (strncmp(token, "nn", 2) == 0);
    verb = (strncmp(token, "vb", 2) == 0);
    adj  = (strncmp(token, "jj", 2) == 0);

    switch (only_weighted_words)
      {
      case 1:  return noun;
      case 2:  return verb;
      case 3:  return adj;
      case 13: return (noun || verb || adj);
      default: return 0;
      }
}

/* ================================================================ */

char *make_n_grams(char *regel)
{
int l,x,y,z,xx,q;
char ngrams[2000][6];
char regel2[REGEL_LENGTE];

/* vervang regel door de markov-chain met lengte n_grams van die regel */

if (verbose) printf("\n>%s<\n",regel);
if ((recsep[0]) && (!strncmp(regel,recsep,strlen(recsep)))) 
  {
  if (debug) printf("rec gevonden\n");
  return regel;
  }

l=strlen(regel);
strcpy(regel2,"*****");
regel2[n_grams-1]=0;

y=n_grams-1;
for (x=0;x<strlen(regel);x++)
    if (isalnum(regel[x])) regel2[y++]=regel[x];
         else if (regel2[y-1]!='*') regel2[y++]='*';
regel2[y]=0;

strcpy(regel,"*****");
regel[n_grams-1]=0;
strcat(regel2,regel);

l= strlen(regel2);
q=0;
for (x=0;x<strlen(regel2)-n_grams+1;x++)
  {
  for (y=0;y<n_grams;y++) ngrams[x][y]=regel2[x+y];
  ngrams[x][n_grams]=0;
  q++;
  }

regel[0]=0;
for (x=0;x<q;x++) 
     if (strcmp(ngrams[x],ngrams[x+1])) 
         {
         regel=strcat(regel,ngrams[x]);
         regel=strcat(regel," ");
         if (strlen(regel)>(REGEL_LENGTE-2)) 
                            error_exit("adjust REGEL_LENGTE and recompile","");
         }
return regel;
}


/* ================================================================ */
/* utils.c
int only_alpha(char *w)

Returns 1 if every character of w is a letter (also for an empty
string), 0 as soon as a character is found that is not.

*/
int only_alpha(char *w)
{
    const char *p;

    for (p=w; *p; p++)
          {
          /* the ctype functions need an unsigned char value */
          if (!isalpha((unsigned char)*p)) return 0;
          }
    return(1);
}
/* ================================================================ */
/* utils.c
int get_next_line (FILE *fin,char *regel)

Reads the next unit of text from fin into regel and filters its words.

If complete_lines is set, number_of_sentences lines are read (lines
shorter than 2 characters are skipped, a line starting with the comment
character is emptied) and joined with spaces. Otherwise words are read
one at a time while x <= number_of_tokens, stopping early at end of
file or at a record separator.

The text is then lowercased, optionally turned into n-grams, split into
tokens, and each token is checked against word_control, min_length and
the stop/keep list. The accepted tokens end up in regel, separated by
spaces; a rejected token still leaves its space behind.

Returns 0 if end of file was seen while reading complete lines.
Otherwise returns the local variable bezig: 1, or 0 if end of file was
hit while reading separate words.

calls:
   lowercase
   clean_word
   is_in_woordenlijst
   check_rec_sep
   make_n_grams
   only_alpha
*/

int get_next_line (FILE *fin,char *regel)
{
/* NOTE(review): nostring, tempstr2, not_al and not_alnum are declared but never used. */
/* NOTE(review): the local bezig hides the global of the same name that initialize() sets. */
char woord[100],nostring[20],regel2[REGEL_LENGTE],tempstr2[10];
int  x,y,bezig,not_al,not_alnum,ok,code_eind,sig_flag;
char *s;
int einde_file;

bezig=1;sig_flag=0;
einde_file=0;
regel[0]=regel2[0]=0;
code[0]=0;
s=NULL;

if (complete_lines) 
      {
      for (x=0;x<number_of_sentences;x++)
          {
          /* read until a line of at least 2 characters or end of file */
          do 
            {
            einde_file=0;
            fgets(regel,REGEL_LENGTE,fin);
            if (feof(fin)) einde_file=1; 
            }
          while ((!einde_file) && (strlen(regel)<2));

          /* a comment line contributes nothing */
          if (regel[0]==comment) regel[0]=0;
          if (!no_codes) 
                   {
                   s=strchr(regel,']');
                   /* NOTE(review): this is start minus position of ']', so it is
                      never positive and the blanking loop below runs at most once
                      (when ']' is the first character); s-&regel[0] may have been
                      meant. Note that regel_nummer is later taken from the start
                      of regel with atoi, so blanking more would change that. */
                   if (s) code_eind=&regel[0]-s;
                   /* NOTE(review): strncpy does not terminate code when regel has
                      19 or more characters; the size of code is not visible here. */
                   strncpy(code,regel,19);
                   }
          if ((ignore_codes) && (s)) for (y=0;y<code_eind+1;y++) regel[y]=' ';
          /* NOTE(review): nothing checks that the joined lines fit in regel2. */
          strcat(regel2,regel);strcat(regel2," ");  
          }
       strcpy(regel,regel2);regel2[0]=0;
       if (debug) printf("net gelezen: %scode : %s\n",regel,code);
       }
      else /* if artificial lines */ 
	 {
         x=0;
         /* if last time a record-separator was detected, append it to regel */
         if (recstr[0])
               {
               strcat(regel,recstr);
               strcat(regel," ");
         /* and empty it for next time */
               recstr[0]=0;
               }
         while ((x<=number_of_tokens) && (bezig))
	       {
	       /* NOTE(review): %s has no width, a word of 100 or more characters overflows woord. */
	       if (fscanf(fin,"%s",&woord)==EOF) {bezig=0;break;}
               clean_word(woord,"");
               if (recsep[0]) 
                       if (!strcmp(woord,recsep)) 
			    {
                            /* keep the separator and the word after it in recstr
                               for the next call, and end this block of tokens */
                            strcpy(recstr,recsep);
                            fscanf(fin,"%s",&woord);
                            strcat(recstr," ");
                            strcat(recstr,woord);
			    x=number_of_tokens;
                            continue;
                            }
               strcat(regel,woord);
               strcat(regel," ");
               x++;
               }

	  }
/* einde_file is only ever set in the complete_lines branch above */
if (einde_file) return 0;

/* now we have read either a normal sentence or a block of n tokens */


if (strlen(regel)>2) 
   { 
   /* with codes the line number is the number at the start of the line,
      otherwise lines are simply counted */
   if ((complete_lines) && (!no_codes)) regel_nummer=atoi(regel);
   if (((complete_lines) && (no_codes)) || (!complete_lines)) regel_nummer++;
   }

if (recsep[0]) record_number=check_rec_sep(record_number,write_rec,recsep,regel);

lowercase(regel);

/* here we convert to n-grams, if necessary        */
/* we do this by replacing the spaces by '*'       */
/* and then cutting up the sentence in n-grams,    */
/* separated by spaces                             */

if (n_grams) regel=make_n_grams(regel);

s=strtok(regel,"?!;:.,()[]\t\n ");

while (s)
  {
  /* cut tokens longer than WORD_LENGTH down to WORD_LENGTH characters */
  if (strlen(s)>=WORD_LENGTH) s[WORD_LENGTH]=0;

  ok=1;
  if ((!only_alpha(s)) && (word_control)) ok=0;

  if (strlen(s)<min_length) ok=0;

  /* remember whether any token of this line is in sig_lijst */
  if (is_in_woordenlijst(s,sig_check,&sig_lijst[0],stop_tags)) sig_flag=1;

  if (ok) /* check if this word is in the keep/discard list */
     {
     /* 1: words in the list are discarded; -1: only words in the list are kept */
     if (check_stopwoorden==1)
          if (is_in_woordenlijst(s,aantal_stopwoorden,&stop_lijst[0],stop_tags)) ok=0; 
     if (check_stopwoorden==-1)
          if (is_in_woordenlijst(s,aantal_stopwoorden,&stop_lijst[0],stop_tags)) ok=1; 
             else ok=0;
     }
  
  if (debug) printf("ok: %d  woord: %s\n",ok,s);

  /* the space is appended even when the token itself is rejected */
  if (ok) strcat(regel2,s);
  strcat(regel2," ");
  
  s=strtok(NULL,"!?;:.,[]()\t\n ");
  }

if ((do_sig_check==1) && (sig_flag==0))  strcpy(regel2,"  ");
    /* no words detected; line is not interesting */
if ((do_sig_check==-1) && (sig_flag==1)) strcpy(regel2,"  ");
    /* words were detected; line is not interesting */

if (debug) printf("sig_flag %d; regel2 %s\n",sig_flag, regel2);

strcpy(regel,regel2);

if (sentences) fprintf(fuit,"%d %s\n",regel_nummer,regel);

return bezig;
}


/* ================================================================ */
/*
Dice coefficient of two concept vectors:

      2 * (number of equal concepts) / (vvl + dvl)

vorige_vector holds vvl entries, deze_vector holds dvl entries; entries
are equal when their concept strings are equal. Every matching pair is
counted, so the result is only a proper coefficient (0..1, and 1 for
identical vectors) when a concept occurs at most once per vector.
Returns 0 when both vectors are empty.
*/
float dice(vector *vorige_vector[], int vvl,vector *deze_vector[],int dvl)
{
    int n,m,equal;

    equal=0;
    for (n=0;n<dvl;n++)
        for (m=0; m<vvl;m++)
             if (!strcmp(vorige_vector[m]->concept,deze_vector[n]->concept))
                 equal++;

    if (dvl+vvl==0) return 0;

    /* the factor 2 was missing, which capped the result at 0.5 */
    return (2.0f*(float)(equal))/(float)(dvl+vvl);
}
/* ================================================================ */
/*
Jaccard coefficient of two concept vectors:

      (number of equal concepts) / (vvl + dvl - number of equal concepts)

that is, the size of the intersection divided by the size of the union.
vorige_vector holds vvl entries, deze_vector holds dvl entries; entries
are equal when their concept strings are equal. Every matching pair is
counted, so the result is only a proper coefficient (0..1, and 1 for
identical vectors) when a concept occurs at most once per vector.
Returns 0 when the denominator is not positive (e.g. both vectors empty).
*/
float jaccard(vector *vorige_vector[], int vvl,vector *deze_vector[],int dvl)
{
    int n,m,equal,unie;

    equal=0;
    for (n=0;n<dvl;n++)
        for (m=0; m<vvl;m++)
             if (!strcmp(vorige_vector[m]->concept,deze_vector[n]->concept))
                 equal++;

    /* the union counts the shared concepts once; subtracting them twice
       gave 0 for identical vectors and values above 1 for similar ones */
    unie=dvl+vvl-equal;

    if (unie<=0) return 0;

    return (float)(equal)/(float)(unie);
}
/* ================================================================ */
/*
Cosine of the angle between the two weighted vectors held in serie[]:
component n of the first vector is serie[n].d1 * serie[n].w, component
n of the second is serie[n].d2 * serie[n].w, for n < num_of_weights.
Returns 0 when either vector has length zero.
*/
float cosine(void)

{
    float dot, len1, len2;
    int i;

    dot = .0;
    len1 = .0;
    len2 = .0;

    /* inner product and both squared lengths in one pass */
    for (i = 0; i < num_of_weights; i++)
        {
        dot  += (serie[i].d1 * serie[i].w)*(serie[i].d2 * serie[i].w);
        len1 += ((serie[i].d1 * serie[i].w)*(serie[i].d1 * serie[i].w));
        len2 += ((serie[i].d2 * serie[i].w)*(serie[i].d2 * serie[i].w));
        }

    if ((!len1)||(!len2))
        return 0;

    return (dot / sqrt(len1 * len2));

}

/* ================================================================ */

/* utils.c
int check_rec_sep(int record_number, int write_rec,char *recsep, char *regel)

Checks whether the first space-separated word of regel equals recsep.
regel itself is not changed.

If it does, the next word is read as the new record number; when that
word is missing or is not a non-zero number, record_number is increased
by one instead. Then:
   - for weight_filetype 2 or 3 the weights for that record are loaded
     with haal_gewichten and num_of_weights is updated;
   - if write_rec is set, record number and regel_nummer are written
     to rec_fuit;
   - if print_rec is set, they are printed on stdout as a comment line.

Returns the (possibly updated) record number. An empty regel, or one
that holds nothing but spaces, returns record_number unchanged.

calls:
   haal_gewichten

*/
int check_rec_sep(int record_number, int write_rec,char *recsep, char *regel)
{
    char tempstr[REGEL_LENGTE],tempstr2[256];
    char *first,*s;
    int r;

    if (debug) printf("REC %s - %s\n",recsep,regel);

    if (!strlen(regel)) return record_number;

    /* work on a copy because strtok writes into its argument; only the
       first two words matter, so cutting off an overlong line is harmless */
    snprintf(tempstr,sizeof tempstr,"%s",regel);

    /* strtok gives NULL for a line of spaces only; strcmp must not see that */
    first=strtok(tempstr," ");
    if ((first==NULL) || (strcmp(first,recsep))) return record_number;

    s=strtok(NULL," ");
    r=s ? atoi(s) : 0;
    if (r) record_number=r; else ++record_number;
    sprintf(tempstr2,"%d",record_number);

    if ((weight_filetype==2) || (weight_filetype==3))
               num_of_weights=haal_gewichten(indexfile,tempstr2);
    if (write_rec)
        fprintf(rec_fuit," %d\t%d\n",record_number,regel_nummer);
    if (print_rec) printf("%c rec %3d %4d\n",comment,record_number,regel_nummer);

    return record_number;
}
