

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include <time.h>
#include <getopt.h>
#include "utils.h"


/* Line buffer: main() fills it through get_next_line() and hands it to
   maak_deze_vector(), which works on a private copy. */
char   regel[REGEL_LENGTE];
/* File-scope scratch indices: maak_deze_vector() loops with n,
   main() loops with x. */
int    n,x;
/* Per-sentence counts of tokens whose first character is 'n', 'v' or 'j';
   reset and updated by maak_deze_vector(). */
float  nouns,verbs,adjs;
/* Not referenced by any code visible in this part of the file. */
float  fl;



/* ---------------------------------------------------------------- */
/*
 * Write the usage summary, the copyright line and the version string to
 * stdout, then terminate the process with exit status 1.
 * Two option lines embed the `comment` character; the last line embeds
 * `version`.  Does not return.
 */
void help()
{
    /* Header lines: plain text, no conversions needed. */
    fputs("Usage: sent_til [-options] text-file weights-file \n"
          "prints sentence_number and 2-sentence-similarity to stdout\n",
          stdout);

    /* The two lines describing -A show the comment character in use. */
    printf("       A    : print date & actual commandline, preceeded by %c-sign\n",
           comment);
    printf("              will also print the recordnumber preceeded by %c-sign\n",
           comment);

    /* Remaining option descriptions, emitted in one go. */
    fputs("       a    : make words of a-z only (default is alphanumeric [a-z0-9])\n"
          "       d    : debug info\n"
          "       h    : Help (this message)\n"
          "       i    : add sentence-codes to output\n"
          "       l<n> : use artificial sentences of n sentences at a time\n"
          "       L<n> : minimum length of words to be considered (default=2)\n"
          "       m0   : use dice's coefficient\n"
          "       m1   : use Jaccard's coefficient\n"
          "       m2   : use cosine coefficient\n"
          "       n    : text has no codes in front of sentence ended by ']')\n"
          "       N<n> : make n-grams of length<n>\n"
          "       o<name> : discard sentences with strings from file 'name'\n"
          "       O<name> : discard sentences without strings from 'name'\n"
          "       q<name>[!]: use list of stop words ('!' to ignore tags).\n"
          "       Q<name>: ignore all but list of obligate words\n"
          "       r<recsep>  : recognize recsep as record-separator.\n"
          "       R<name>  : make file with record- and line-numbers.\n"
          "       s<n> : use artificial lines of <n> tokens in stead of lines\n"
          "              default= 20 tokens\n"
          "       S<name>: print this artificial file to file name\n"
          "       v    : verbose\n"
          "       w0   : compute all (only with weights-file)\n"
          "       w1   : compute only nouns\n"
          "       w2   : compute only verbs\n"
          "       w3   : compute only adjs\n"
          "       w13  : compute only nouns and adjs\n"
          "              note that w1-w13 make only sense with tagged files\n"
          "Copyright Hans Paijmans 1995,1996,1997\n",
          stdout);

    printf("version %s\n\n", version);
    exit(1);
}


/* ---------------------------------------------------------------- */
/*
 * Parse the command line with getopt() and store the results in the
 * program-wide option variables.
 *
 *   argc, argv : as received by main().
 *
 * When no arguments are given, or the first argument is "-h", help() is
 * called (it terminates the program).  After the options, the first
 * remaining argument is copied into tekstfile and the second, if present,
 * into indexfile; in that case the weight file type is determined with
 * check_file_type().  Invalid numeric option values and files that cannot
 * be opened are reported through error_exit().
 *
 * NOTE(review): recsep, tekstfile and indexfile are filled with strcpy();
 * their capacities are declared outside this file, so the copies cannot be
 * bounded here -- confirm the buffers are large enough for any argument.
 */
void get_arguments(int argc,char **argv)
{
    int opt;                /* getopt() returns int; a char cannot hold -1
                               on platforms where plain char is unsigned */
    int i;
    size_t len;
    extern int optind;
    extern char *optarg;

    if ((argc < 2) || (!strcmp(argv[1], "-h"))) help();

    while ((opt = getopt(argc, argv, "Aadhil:L:m:nN:o:O:q:Q:r:R:s:S:vw:")) != -1)
    {
        switch (opt)
        {
        case 'A':   /* echo the command line behind the comment character */
            printf("%c", comment);
            for (i = 0; i < argc; i++) printf("%s ", argv[i]);
            printf("\n");
            print_rec = 1;
            break;

        case 'a':   /* alphanumerics, or alpha's only */
            word_control = 1;
            break;

        case 'd':
            debug = 1;
            break;

        case 'h':
            help();
            break;

        case 'i':
            add_codes = 1;
            break;

        case 'l':
            number_of_sentences = atoi(optarg);
            if (number_of_sentences < 1)
                error_exit("number of sentences must be >0", "");
            /* the original fell through into 'L' here and overwrote
               min_length with the same number */
            break;

        case 'L':
            min_length = atoi(optarg);
            if (min_length < 1) error_exit("minimun length must be >0", "");
            break;

        case 'm':
            method = atoi(optarg);
            break;

        case 'n':
            no_codes = 1;
            break;

        case 'N':
            n_grams = atoi(optarg);
            if ((n_grams < 1) || (n_grams > 5))
                error_exit("n should be 0<n<6", "");
            break;

        case 'o':   /* +1 for -o, -1 for -O; both read the same list */
        case 'O':
            do_sig_check = (opt == 'o') ? 1 : -1;
            sig_check = lees_woordenlijst(optarg, &sig_lijst[0]);
            break;

        case 'q':   /* +1 for -q, -1 for -Q; both read the same list */
        case 'Q':
            check_stopwoorden = (opt == 'q') ? 1 : -1;
            len = strlen(optarg);
            /* a trailing '!' is a flag, not part of the file name; the
               length test keeps an empty argument from indexing [-1] */
            if ((len > 0) && (optarg[len - 1] == '!'))
            {
                optarg[len - 1] = 0;
                stop_tags = 1;
            }
            aantal_stopwoorden = lees_woordenlijst(optarg, &stop_lijst[0]);
            break;

        case 'r':
            strcpy(recsep, optarg);
            break;

        case 'R':
            write_rec = 1;
            if ((rec_fuit = fopen(optarg, "w")) == NULL)
                error_exit("no recsep-file opened:", optarg);
            break;

        case 's':
            complete_lines = 0;
            number_of_tokens = atoi(optarg);
            if (number_of_tokens == 0) number_of_tokens = 20;
            break;

        case 'S':
            sentences = 1;
            if ((fuit = fopen(optarg, "w")) == NULL)
                error_exit("no sentence-file opened: ", optarg);
            break;

        case 'v':
            verbose = 1;
            break;

        case 'w':
            only_weighted_words = atoi(optarg);
            break;

        default:    /* unknown option: getopt() has already reported it */
            break;
        }
    }

    if (number_of_tokens == 0) number_of_tokens = 20;

    if (optind < argc) strcpy(tekstfile, argv[optind++]);

    if (optind < argc)
    {
        strcpy(indexfile, argv[optind++]);
        if (only_weighted_words == -1) only_weighted_words = 0;
        weight_filetype = check_file_type(indexfile);
    }

    if ((write_rec) && (recsep[0] == 0))
        error_exit("attempted R-option without specifying record separator", "");

    if (debug)
    {
        check_all();
        printf("method:              %d\n", method);
        printf("indexfile            %s\n", indexfile);
        printf("tekstfile            %s\n", tekstfile);
    }
}


/* ---------------------------------------------------------------- */
/*
 * Build the term vector for one sentence.
 *
 *   s : NUL-terminated sentence text; it is copied, not modified.
 *
 * The copy is split on '.', ']' and ' '.  For every token:
 *   - nouns / verbs / adjs is incremented when the token starts with
 *     'n' / 'v' / 'j';
 *   - check_category() decides whether the token is accepted;
 *   - when weights are loaded (and only_weighted_words >= 0) the longest
 *     serie[].concept that is a prefix of the token is looked up; its
 *     weight is used for the token and its d1 flag is set;
 *   - an accepted token is stored in a freshly malloc'ed deze_vector[]
 *     entry (weight 0.0 when no concept matched).
 *
 * Returns the number of entries written to deze_vector[].  The entries are
 * allocated here and never freed by this function.
 *
 * The file-scope index `n` is used as loop counter, as before.
 *
 * NOTE(review): the capacity of deze_vector[] is declared outside this
 * file, so `num` is not bounds-checked here -- confirm it can hold the
 * longest possible sentence.
 */
int maak_deze_vector(char *s)
{
    float  best_weight;
    char   *token, regel[REGEL_LENGTE], *ss;
    size_t best_len, concept_len, token_len;
    int    num, best_term, matched, accepted;

    nouns = verbs = adjs = .0;
    number_of_words = number_of_weighted_words = 0;

    /* bounded copy: an over-long sentence is truncated, not overflowed */
    snprintf(regel, sizeof regel, "%s", s);
    num = 0;

    if ((no_codes) && (complete_lines))
    {
        ss = strchr(regel, ']');
        if (ss)
        {
            ss++;
            /* source and destination overlap, so strcpy() is undefined */
            memmove(regel, ss, strlen(ss) + 1);
        }
    }
    if (debug) printf("\n%s\n", regel);

    token = strtok(regel, ".] ");
    for (n = 0; n < num_of_weights; n++) serie[n].d1 = 0;

    while (token)                           /* word in the sentence */
    {
        switch (token[0])                   /* count by word category */
        {
        case 'n': nouns++; break;
        case 'v': verbs++; break;
        case 'j': adjs++;  break;
        default:  break;
        }

        accepted = check_category(token);

        n = 0;
        best_len = 0;
        best_term = 0;
        matched = 0;
        best_weight = .0;

        if ((only_weighted_words >= 0) && (num_of_weights) && (accepted))
        {
            token_len = strlen(token);

            /* walk the index and look for the longest matching concept */
            while (n < num_of_weights)
            {
                concept_len = strlen(serie[n].concept);
                if (!strncmp(token, serie[n].concept, concept_len))
                {
                    if (best_len < concept_len)
                    {
                        best_len = concept_len;
                        best_weight = serie[n].w;
                        best_term = n;
                        matched = 1;
                    }
                    /* exact match: nothing longer can follow, stop */
                    if (token_len == concept_len) n = num_of_weights;
                }
                n++;
            }
        }

        if ((verbose) && (accepted))
        {
            printf("%20s - %f - %d\n", token, best_weight, accepted);
        }

        /* Only flag a term that really matched; the original set
           serie[0].d1 for every token that matched nothing. */
        if (matched) serie[best_term].d1 = 1;

        if (accepted)
        {
            deze_vector[num] = malloc(sizeof *deze_vector[num]);
            if (deze_vector[num] == NULL) error_exit("out of memory", "");
            deze_vector[num]->weight = best_weight;
            /* bounded copy into the entry's own concept buffer */
            snprintf(deze_vector[num]->concept,
                     sizeof deze_vector[num]->concept, "%s", token);
            num++;
        }

        token = strtok(NULL, ".] ");
    }

    return (num);
}



/* ---------------------------------------------------------------- */
/*
 * Program entry point.
 *
 * Initialises the program, parses the command line, optionally loads the
 * weights (for weight file types 0, 1, 4 and 5) and then reads the text
 * file line by line.  For every line longer than one character the term
 * vector is built with maak_deze_vector() and its similarity to the
 * previous line's vector is printed together with regel_nummer, using the
 * coefficient selected by `method` (0 dice, 1 jaccard, 2 cosine).
 *
 * Returns 0 on normal completion; a text file that cannot be opened is
 * reported through error_exit().
 *
 * NOTE(review): regel_nummer is not updated in this function; presumably
 * get_next_line() maintains it -- confirm.
 * The file-scope index `x` is used as loop counter, as before.
 */
int main(int argc,char **argv)
{
    float similarity;
    int deze_vector_lengte, vorige_vector_lengte;

    initialize(argv[0]);

    get_arguments(argc, argv);

    if (((weight_filetype >= 0) && (weight_filetype < 2))
          ||
        ((weight_filetype >= 4) && (weight_filetype < 6)))
        num_of_weights = haal_gewichten(indexfile, tekstfile);

    if ((fin = fopen(tekstfile, "r")) == NULL) error_exit("no txt-file", "");
    no_codes = check_codes(fin);

    vorige_vector_lengte = 0;

    while ((!feof(fin)) && (bezig))
    {
        bezig = get_next_line(fin, regel);

        if (strlen(regel) > 1)
        {
            deze_vector_lengte = maak_deze_vector(regel);

            switch (method)
            {
            case 0:
                similarity = dice(vorige_vector, vorige_vector_lengte,
                                  deze_vector, deze_vector_lengte);
                break;
            case 1:
                similarity = jaccard(vorige_vector, vorige_vector_lengte,
                                     deze_vector, deze_vector_lengte);
                break;
            case 2:
                similarity = cosine();
                break;
            default:
                /* unknown method: print a defined value instead of an
                   uninitialised one */
                similarity = .0;
                break;
            }

            printf(" %4d       %f\n", regel_nummer, similarity);

            /* The previous sentence's entries were malloc'ed by
               maak_deze_vector() and are no longer needed: release them
               before their pointers are overwritten. */
            for (x = 0; x < vorige_vector_lengte; x++) free(vorige_vector[x]);

            for (x = 0; x < deze_vector_lengte; x++) vorige_vector[x] = deze_vector[x];
            vorige_vector_lengte = deze_vector_lengte;
            for (x = 0; x < num_of_weights; x++) serie[x].d2 = serie[x].d1;
        }
    }

    fclose(fin);
    /* fuit and rec_fuit are only opened by the -S and -R options;
       fclose() must not be called on a null stream. */
    if (fuit) fclose(fuit);
    if (rec_fuit) fclose(rec_fuit);
    return 0;
}












