
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include <math.h>
#include "utils.h"

#define COMMENT     '#'
#define MAX_WORDS   10000

/*
 * One word-table entry. The same layout is used for the collection-wide
 * table (algemeen) and for the per-record table (enkel).
 */
typedef struct {
    char *woord;   /* malloc'ed, NUL-terminated copy of the word */
    int   clean;   /* algemeen only: 1 = not yet counted in the current record */
    int   df;      /* algemeen: number of records containing the word;
                      enkel: occurrences of the word within the current record */
} veld;

/* Set to COMMENT by main(); not read anywhere else in this file. */
char comment;

/* Collection-wide vocabulary, filled during main()'s first pass. */
veld algemeen[MAX_WORDS];

/* Words of the record currently being processed in main()'s second pass. */
veld enkel[5000];

int records;          /* number of records seen during the first pass */
int x;                /* scratch loop index used by main() */
int rec;              /* not used at file scope in this file */
int woordteller;      /* number of entries in use in algemeen[] */
int found;            /* scratch "word already in table" flag used by main() */
int enkelteller;      /* number of entries in use in enkel[] */
int laatste_record;   /* record number seen on the previously read line */


/* ---------------------------------------------------------------- */
/*
 * Print a short usage summary, the copyright line and the program version
 * (`version` is declared outside this file) on stdout, then terminate the
 * process with exit status 1. This function does not return.
 */
void help(void)
{
    /* "[options]" was misspelled with a digit zero ("[0ptions]"). */
    printf("Usage: smallsmart [options] filename\n");
    printf("  options: BEZIG to be implemented\n");
    printf("Copyright Hans Paijmans 1996,1997\n");
    printf("version %s\n\n", version);
    exit(1);
}
/* ---------------------------------------------------------------- */

/* ---------------------------------------------------------------- */

/*
 * Remove one trailing '!' from arg, if there is one.
 * Returns 1 when a '!' was removed and 0 otherwise. An empty string is left
 * untouched (the previous inline code indexed position -1 in that case).
 */
static int strip_trailing_bang(char *arg)
{
    size_t len = strlen(arg);

    if (len == 0 || arg[len - 1] != '!')
        return 0;
    arg[len - 1] = '\0';
    return 1;
}

/*
 * Parse the command line with getopt() and store the results in the option
 * globals, all of which are declared outside this file.
 *
 *   -d        debug = 1
 *   -h        print usage via help() (which exits)
 *   -l N      number_of_sentences = N, N must be > 0
 *   -L N      min_length = N, N must be > 0
 *   -n        no_codes = 1
 *   -N N      n_grams = N, 1 <= N <= 5
 *   -q FILE   check_stopwoorden = 1;  word list FILE is read into stop_lijst
 *   -Q FILE   check_stopwoorden = -1; word list FILE is read into stop_lijst
 *             (for -q/-Q a trailing '!' on FILE is removed and sets stop_tags)
 *   -o FILE   do_sig_check = 1;  word list FILE is read into sig_lijst
 *   -O FILE   do_sig_check = -1; word list FILE is read into sig_lijst
 *   -r STR    recsep = STR
 *   -R FILE   write_rec = 1, rec_fuit = FILE opened for writing
 *   -s N      complete_lines = 0, number_of_tokens = N (non-zero)
 *   -S FILE   sentences = 1, complete_lines = 0, fuit = FILE opened for
 *             writing; FILE must not exist yet
 *   -v        verbose = 1
 *
 * The first non-option argument is copied to tekstfile. When recsep is still
 * empty afterwards it defaults to "rec". Invalid values are reported through
 * error_exit(), which is expected not to return.
 */
void get_arguments(int argc, char **argv)
{
    char sent_name[100];
    int o;      /* int, not char: getopt() returns -1 at the end of the options
                   and that value is not representable where char is unsigned */
    extern int optind;
    extern char *optarg;

    sent_name[0] = '\0';

    while ((o = getopt(argc, argv, "dhl:L:nN:q:Q:o:O:r:R:s:S:v")) != -1) {
        switch (o) {
        case 'd':
            debug = 1;
            break;

        case 'h':
            help();
            break;

        case 'l':
            number_of_sentences = atoi(optarg);
            if (number_of_sentences < 1)
                error_exit("number of sentences must be >0", "");
            break;      /* was missing: -l used to fall through and clobber min_length */

        case 'L':
            min_length = atoi(optarg);
            if (min_length < 1)
                error_exit("minimum length must be >0", "");
            break;

        case 'n':
            no_codes = 1;
            break;

        case 'N':
            n_grams = atoi(optarg);
            if ((n_grams < 1) || (n_grams > 5))
                error_exit("ngrams must be 0<n_grams<6", "");
            break;

        case 'q':
        case 'Q':
            check_stopwoorden = (o == 'q') ? 1 : -1;
            if (strip_trailing_bang(optarg))
                stop_tags = 1;
            aantal_stopwoorden = lees_woordenlijst(optarg, &stop_lijst[0]);
            break;

        case 'o':
        case 'O':
            do_sig_check = (o == 'o') ? 1 : -1;
            sig_check = lees_woordenlijst(optarg, &sig_lijst[0]);
            break;

        case 'r':
            /* NOTE(review): recsep's capacity is declared outside this file,
               so this copy cannot be bounded here. */
            strcpy(recsep, optarg);
            break;

        case 'R':
            write_rec = 1;
            if ((rec_fuit = fopen(optarg, "w")) == NULL)
                error_exit("no recsep-file opened:", optarg);
            break;

        case 's':
            complete_lines = 0;
            number_of_tokens = atoi(optarg);
            if (!number_of_tokens)
                error_exit("no length given ", "");
            break;

        case 'S':
            sentences = 1;
            complete_lines = 0;
            if (!number_of_tokens)
                number_of_tokens = 20;
            if (strlen(optarg) >= sizeof sent_name)
                error_exit("sentence-file name too long: ", optarg);
            /* snprintf keeps the copy inside sent_name in every case */
            snprintf(sent_name, sizeof sent_name, "%s", optarg);
            /* refuse to overwrite an existing file */
            if ((fuit = fopen(sent_name, "r")))
                error_exit("file exists: ", sent_name);
            if ((fuit = fopen(sent_name, "w")) == NULL)
                error_exit("no sentence-file opened: ", sent_name);
            break;

        case 'v':
            verbose = 1;
            break;

        default:
            /* unknown option or missing argument: getopt() has already
               printed a diagnostic; it is ignored here, as before */
            break;
        }
    }

    /* NOTE(review): tekstfile's capacity is declared outside this file. */
    if (optind < argc)
        strcpy(tekstfile, argv[optind++]);
    if (!recsep[0])
        strcpy(recsep, "rec");

    if (debug) {
        check_all();
        printf("textfile:          %s\n", tekstfile);
        printf("sent_name:         %s\n", sent_name);
    }
}


/* ---------------------------------------------------------------- */
/*
 * Print the weighted term vector of one record on stdout, one line per term
 * in the form "<rec> <weight> <word>".
 *
 * The terms are the first `enkelteller` entries of the global table enkel[],
 * whose df field holds the number of occurrences inside the record. For each
 * term:
 *
 *     tf     = 0.5 + 0.5 * (occurrences / highest occurrences in the record)
 *     weight = tf * log((records + 1) / df)
 *
 * where df is the term's document frequency looked up by name in the global
 * table algemeen[] (woordteller entries) and records is the global record
 * count.
 *
 * rec          record number printed at the start of every line
 * enkelteller  number of valid entries in enkel[] (the parameter shadows the
 *              global of the same name); nothing is printed when it is <= 0
 * modus        weighting scheme; when its third character is 'c' each weight
 *              is divided by the Euclidean length of the vector. The first two
 *              characters do not influence the result, because the augmented
 *              tf and log-ratio components above are the only ones
 *              implemented. NULL or a string shorter than three characters
 *              selects the unnormalised weight.
 *
 * Returns the number of lines printed. The function used to leave its int
 * result unspecified; the callers in this file ignore it.
 */
int print_woorden(int rec, int enkelteller, char *modus)
{
    double weight[MAX_WORDS];
    double max_tf = 0.0;
    double sum = 0.0;
    double norm;
    int normalise;
    int i, j;

    if (enkelteller <= 0)
        return 0;

    /* highest within-record frequency, the denominator of the augmented tf */
    for (i = 0; i < enkelteller; i++) {
        if (enkel[i].df > max_tf)
            max_tf = enkel[i].df;
    }

    for (i = 0; i < enkelteller; i++) {
        /* Fall back to a document frequency of 1 when the word is missing
           from algemeen[]; the lookup result used to stay uninitialised. */
        double df = 1.0;
        double tf;

        for (j = 0; j < woordteller; j++) {
            if (strcmp(enkel[i].woord, algemeen[j].woord) == 0) {
                if (algemeen[j].df > 0)
                    df = algemeen[j].df;
                break;
            }
        }

        tf = 0.5 + 0.5 * (enkel[i].df / max_tf);
        weight[i] = tf * log((records + 1) / df);
        sum += weight[i] * weight[i];
    }

    /* look at modus[2] only when the string really has a third character */
    normalise = modus != NULL && modus[0] != '\0' && modus[1] != '\0'
                && modus[2] == 'c';
    norm = sqrt(sum);

    for (i = 0; i < enkelteller; i++) {
        double w = weight[i];

        /* an all-zero vector is printed as is instead of dividing by zero */
        if (normalise && norm > 0.0)
            w /= norm;
        printf("%d %f %s\n", rec, w, enkel[i].woord);
    }

    return enkelteller;
}
 


/* ---------------------------------------------------------------- */
main(int argc,char **argv)
{
char o, regel[REGEL_LENGTE], *woord;
char separators[20]="\n.[]();, ";


FILE *fin;

comment=COMMENT;
laatste_record=-1;
initialize(argv[0]);
get_arguments(argc,argv);

if ((fin=fopen(tekstfile,"r"))==NULL) error_exit("no txt-file: ",tekstfile);


/* -------------------- first run: count words --------------------- */ 
woordteller=0;

while (!feof(fin))
  {
  bezig=get_next_line(fin,regel);
  if (debug) printf("na get_next_line: %s\n",regel);
  woord=strtok(regel,separators);
  if (record_number!=laatste_record) 
       {
        records++;laatste_record=record_number;
        for (x=0;x<woordteller;x++) algemeen[x].clean=1; 
       }

  while (woord)
    {  
    found=0;
    x=0;
    while (x<woordteller) 
          {
          if (!strcmp(woord,algemeen[x].woord))
	    {
            found=1;
	    if (algemeen[x].clean) {algemeen[x].df++;algemeen[x].clean=0;}
            }
          x++;
          }

    if (!found) 
        {
        algemeen[woordteller].woord=(char*)malloc(strlen(woord)+1);
        strcpy(algemeen[woordteller].woord,woord);
        algemeen[woordteller].clean=0;
        algemeen[woordteller].df=1;
        woordteller++;
        }
    woord=strtok(NULL,separators);
    }
  
  }



/*

for (x=0;x<woordteller;x++) printf("%d %s\n",
                           algemeen[x].df,algemeen[x].woord);

			   */

fseek(fin,0,SEEK_SET);

/* -------------------- second run: count words --------------------- */ 

enkelteller=0;
laatste_record=-1;
record_number=0;

while (!feof(fin))
  {
  bezig=get_next_line(fin,regel);
  if (debug) printf("%d na get_next_line: %s\n",record_number,regel);
  woord=strtok(regel,separators);
  
  if (record_number!=laatste_record) 
     {
     laatste_record=record_number;
     print_woorden(record_number-1,enkelteller,"atc");
     enkelteller=0;
     }

  else while (woord)
    {  
    found=0;
    x=0;
    while (x<enkelteller) 
          {
          if (!strcmp(woord,enkel[x].woord))
	    {
            found=1;
	    enkel[x].df++;
            }
          x++;
          }

    if (!found) 
        {
        if (enkel[enkelteller].df)
	  {
          free(enkel[enkelteller].woord);
          }
        enkel[enkelteller].woord=(char*)malloc(strlen(woord)+1);
        strcpy(enkel[enkelteller].woord,woord);
        enkel[enkelteller].df=1;
        enkelteller++;
        }
    woord=strtok(NULL,separators);
    }
  }

print_woorden(record_number,enkelteller,"atc");
 
exit(0);
}












