
#define MAX_WORDS  10000
#define SEEK_SET   0
#define SYN_LIST_LENGTH 5000

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include <time.h>
#include <getopt.h>
#include "utils.h"


/* One entry in the table of active chains. */
typedef struct {
    int  last_sentence;         /* number of the last sentence that contained the concept */
    char concept[WORD_LENGTH];  /* the chained word; an empty string marks a free slot */
} chain;

/* A synonym pair: occurrences of word1 in the text are replaced by word2. */
typedef struct {
    char *word1;    /* word to look for */
    char *word2;    /* word substituted for it */
} syn;

/* Table of chains; a slot whose concept is the empty string is free. */
chain  active_chains[MAX_WORDS];
/* Synonym pairs read from the -T file; `synonyms` entries are in use. */
syn    syn_list[SYN_LIST_LENGTH];

char   regel[1024];     /* buffer for the current input line */
char   syn_name[100];   /* name of the synonym file (option -T) */

/* The following six variables have no use visible in this section. */
int    n;
int    x;
float  nouns;
float  verbs;
float  adjs;
float  fl;

int    chain_length;    /* sentences a chain survives without being refreshed */
int    top;             /* one past the highest slot of active_chains in use */
int    synonyms;        /* -T flag while parsing; afterwards number of pairs loaded */
float  treshold;        /* a word needs a weight above this to enter a chain */

/* ---------------------------------------------------------------- */
/*
 * help: write the usage summary to stdout and terminate the program with
 * exit status 1.  This function does not return.
 */
void help()
{
    /* Everything after the two lines that need the comment character. */
    static const char *const tail[] = {
        "       a    : make words of a-z only (default is alphanumeric [a-z0-9])\n",
        "       c<n> : maximum chainlength (default 6)\n",
        "       d    : debug info\n",
        "       h    : Help (this message)\n",
        "       i    : add sentence-codes to output\n",
        "       n    : text has no codes in front of sentence ended by ']')\n",
        "       L<n> : minimum length of words to be considered (default=2)\n",
        "       o<name>  : discard sentences with strings from file 'name'\n",
        "       O<name>  : discard sentences without strings from 'name'\n",
        "       q<name>[!]  : use list of stop words. ('!' to ignore tags).\n",
        "       Q<name>  : ignore all but list of obligate words\n",
        "       r<recsep>: recognize recsep as record-separator.\n",
        "       R<name>  : make file with record- and line-numbers.\n",
        "       s<n>     : use artificial lines of <n> tokens in stead of lines\n",
        "                  default= 20 tokens\n",
        "       S<name>  : print this artificial file to file name\n",
        "       t<float> : set treshold for weight\n",
        "       T<name>  : use 'name' as a list of synonyms.\n",
        "       v    : verbose\n",
        "       w0   : compute all (only with weights-file)\n",
        "       w1   : compute only nouns\n",
        "       w2   : compute only verbs\n",
        "       w3   : compute only adjs\n",
        "       w13  : compute only nouns and adjs\n",
        "              note that w1-w13 make only sense with tagged files\n",
        "Copyright Hans Paijmans 1995, 1996\n",
        "Version 2.01, 30 sept. 1996\n\n"
    };
    size_t line;

    fputs("Usage: chains [-options] text-file [weights-file]\n", stdout);
    fputs("prints sentence_number and # of active chains to stdout\n", stdout);
    fputs("Options:\n", stdout);

    /* Only these two lines interpolate the comment character. */
    printf("       A    : print date & actual commandline, preceeded by %c-sign\n",comment);
    printf("              will also print the recordnumber preceeded by %c-sign\n",comment);

    for (line = 0; line < sizeof tail / sizeof tail[0]; line++)
        fputs(tail[line], stdout);

    exit(1);
}

/* ---------------------------------------------------------------- */


/* ---------------------------------------------------------------- */
void get_arguments(int argc,char **argv)
{
FILE *synf;
char *s,s1[100],s2[100],regel[1024],sent_name[100];
char o,n;
time_t seconds;
struct tm *t2;
extern int optind;
extern char *optarg;

treshold=-1;
chain_length=6;
synonyms=0;
sent_name[0]=0;

while ((o=getopt(argc,argv,"Aac:dhiL:no:O:q:Q:r:R:s:S:t:T:vw:"))!=-1)
     switch (o)
       {
       case 'A': printf("%c ",comment);
                 for (n=0;n<argc;n++) printf("%s ",argv[n]);
                 printf("\n");
                 print_rec=1;
                 break;
       case 'a': /* alphanumerics, or alpha's only */
                 word_control=1;
		 break;
       case 'c': chain_length=atoi(optarg); 
	         break; 
       case 'd': debug=1; 
                 break;
       case 'h': help();
                 break;
       case 'i': add_codes=1;
                 break;
       case 'L': min_length=atoi(optarg);
                 if (min_length<1) error_exit("minimun length must be >0","");
                 break;
       case 'n': no_codes=1;
                 break;

       case 'q': check_stopwoorden=1;
                 if (optarg[strlen(optarg)-1]=='!') 
                    {
                    optarg[strlen(optarg)-1]=0;
                    stop_tags=1;
                    }
		 /* printf("%s\n",optarg); */
                 aantal_stopwoorden=lees_woordenlijst(optarg,&stop_lijst[0]);
                 break;
       case 'Q': check_stopwoorden=-1;
                 aantal_stopwoorden=lees_woordenlijst(optarg,&stop_lijst[0]);
                 break; 

       case 'o': do_sig_check=1;
                 sig_check=lees_woordenlijst(optarg,&sig_lijst[0]);
                 break;
       case 'O': do_sig_check=-1;
                 sig_check=lees_woordenlijst(optarg,&sig_lijst[0]);
                 break;

       case 'r': strcpy(recsep,optarg);
                 break;
       case 'R': write_rec=1;
                 if ((rec_fuit=fopen(optarg,"w"))==NULL)
                                error_exit("no recsep-file opened:",optarg);
                 break;


       case 's': complete_lines=0;
                 number_of_tokens=atoi(optarg);
                 if (!number_of_tokens) error_exit("no length given ","");
                 break;
       case 'S': sentences=1;complete_lines=0;
                 if (!number_of_tokens) number_of_tokens=20;
                 strcpy(sent_name,optarg);
                 if ((fuit=fopen(sent_name,"r"))) 
                          error_exit("file exists: ",sent_name);
                 if ((fuit=fopen(sent_name,"w"))==NULL) 
                          error_exit("no sentence-file opened: ",sent_name);
                 break;


       case 't': /* treshold vor weights */
                if (only_weighted_words==-1) only_weighted_words=0;
	        treshold=atof(optarg);
                if (!treshold) error_exit("treshold should be > nul","");
                break;

       case 'T': synonyms=1;strcpy(syn_name,optarg);
                 break;

       case 'v': verbose=1;
                 break;
       case 'w': only_weighted_words=atoi(optarg);
                 break;
       case ':': printf("parameter missing...\n");
                 break;

       }
       
if (synonyms)
  {                 
  if ((synf=fopen(syn_name,"r"))==NULL) 
                                error_exit("no synonym-file found: ",syn_name);
  synonyms=0;
  while (!feof(synf))
    {
    fgets(regel,100,synf);
    if (!((regel[0]==comment) || (strlen(regel)<2)))
	{
        sscanf(regel,"%s %s",&s1,&s2);
        syn_list[synonyms].word1=malloc(strlen(s1)+1);
        syn_list[synonyms].word2=malloc(strlen(s2)+1);
	strcpy(syn_list[synonyms].word1,s1);
        strcpy(syn_list[synonyms].word2,s2);
        synonyms++;
        }
    }  
  fclose(synf);
    /* load a table of synonyms here */
  }

if (number_of_tokens==0) number_of_tokens=20;

if (optind<argc) strcpy(tekstfile,argv[optind++]);

if (optind<argc) 
    {
    strcpy(indexfile,argv[optind++]);
    if (only_weighted_words==-1) only_weighted_words=0;
    weight_filetype = check_file_type(indexfile);
    }

if ((write_rec) && (recsep[0]==0))
    error_exit("attempted R-option without specifying record separator","");


if (debug)
   {
   check_all();
   printf("chain_length:      %d\n",chain_length);
   printf("treshold:          %f\n",treshold);
   printf("indexfile:         %s\n",indexfile);
   printf("textfile:          %s\n",tekstfile);
   printf("sent_name:         %s\n",sent_name);
   }
}
/* ---------------------------------------------------------------- */
/*


*/
int maak_chains(char *regel,int zin_nummer)
{
float langste_gewicht,temp;
char  *token,*s,holder[200];
char  langste_woord[50];
int   lengte,num,found,x,n,accepted;

number_of_words=number_of_weighted_words=0;

num=0;

if ((!no_codes) && (complete_lines)) 
   {
   s=strchr(regel,']');
   if (s) {s++;strcpy(regel,s);}
   }

if (verbose) printf("\nrec: %d; line to chain: %s\n",record_number,regel);

token=strtok(regel,".]\t\n ");

while (token)
    {
    strcpy(holder,token);
       /* check of holder (woord uit de zin) al in de chains zit */
    found=0;  
    accepted=check_category(holder);
    langste_gewicht=1;

    /* replace this word by its synonym */
    
    if (synonyms)
      for (x=0;x<synonyms;x++)
         if (!strcmp(holder,syn_list[x].word1))
	    {
            strcpy(holder,syn_list[x].word2);break;
            }
      

    /* loop langs eventueel aanwezige index-file en zoek gewicht */

    if ((only_weighted_words>=0) && (num_of_weights) && (accepted))
         {
         n=langste_woord[0]=0;langste_gewicht=.0;

         while (n<num_of_weights)
              {
	      lengte=strlen(serie[n].concept);
              if (!strncmp(holder,serie[n].concept,lengte))
	             {
                     if (strlen(langste_woord)<lengte) 
                            {
                            strcpy(langste_woord, serie[n].concept);
                            langste_gewicht=serie[n].w;
		            }
                     if (strlen(holder)==lengte) n=num_of_weights+10;
                     }
              n++;
              }

         if (n<num_of_weights+10) langste_gewicht=-1;
         }

    if (!num_of_weights) 
             {
             langste_gewicht = treshold+1;
             strcpy(langste_woord,holder);
             }

    /* ------------ */

    if ((langste_gewicht > treshold) && (accepted))
         {
         for (x=0;x<top+1;x++)
           if (!strcmp(holder,active_chains[x].concept))
                {
                active_chains[x].last_sentence=zin_nummer;
                found=1;
                break;
                }

         if (!found)
           for (x=0;x<top+1;x++)
             if (active_chains[x].concept[0]==0)
	       {
               strcpy(active_chains[x].concept,holder);
               if (x>=top) top=x+1;
               active_chains[x].last_sentence=zin_nummer;
               break;
               }
         }

    if (debug) 
         {
         printf(">> x=%3d   %s ",x,holder);
         if (only_weighted_words>=0) 
             printf("n_of_w: %2d; l_gew.: %f; tresh: %f; acc: %d",
                    num_of_weights,langste_gewicht,treshold,accepted);
         printf("\n");
         }

    token=strtok(NULL,".]\t\n ");
    num++;
    }
/* en nu de ouwetjes eruit gooien... */

for (x=0;x<top;x++) 
      if (active_chains[x].last_sentence < zin_nummer-chain_length)
            {
            if ((verbose) && (active_chains[x].concept[0]))
                  printf("dropped: x=%3d   %s\n",x,active_chains[x].concept);
            active_chains[x].concept[0]=0;
            }

return (num);
}
/* get_next_line used to be defined here */

/* ---------------------------------------------------------------- */
/*
 * main: read the text file line by line, update the chains for every
 * non-empty line and print, per line, the line number and the number of
 * chains that are active after it.
 *
 * Returns 0 on normal completion; fatal problems go through error_exit().
 */
int main(int argc,char **argv)
{
int y,counter;

top=0;
initialize(argv[0]);
get_arguments(argc,argv);

/* start with an empty chain table */
for (y=0;y<MAX_WORDS;y++) active_chains[y].concept[0]=0;

/* weights are only loaded for file types 0, 1, 4 and 5 */
if (((weight_filetype>=0) && (weight_filetype<2))
   ||
  ((weight_filetype==4) || (weight_filetype==5)))
           num_of_weights=haal_gewichten(indexfile,tekstfile);


if ((fin=fopen(tekstfile,"r"))==NULL) error_exit("no txt-file: ",tekstfile);
no_codes=check_codes(fin);

while ((!feof(fin)) && (bezig))
      {
      bezig=get_next_line(fin,regel);

      if (debug) printf("na get_next_line: %s\n",regel);

      if (strlen(regel)>1)
         {

/* create new chains or refresh existing ones */

          maak_chains(regel,regel_nummer);

/* count the active chains */

          counter=0;
          if (verbose) printf("top=%d\n",top);
          for (y=0;y<top;y++)
             if (active_chains[y].concept[0])
                {
                if (verbose) printf("->%3d %s\n",
                      active_chains[y].last_sentence,active_chains[y].concept);
                counter++;
                }
          if (add_codes) printf("%s ",code);
          printf("% 4d%7d  ",regel_nummer,counter  );
          printf("\n");

          }
      }
fclose (fin);
/* fuit and rec_fuit are only opened by the -S and -R options; passing a
   null pointer to fclose() is not allowed, so close them only when set */
if (fuit) fclose(fuit);
if (rec_fuit) fclose(rec_fuit);
return 0;
}


















