
/*

Usage: sent_weight [-options] text-file weights-file

Computes average weight/sentence.

-----------------------------------------
weights inputfile (second file argument on the command line):

EITHER series discrimination values

     represm   0.04465
     walk   0.06890

OR series document-word weights

     brad.txt represent 11.000000
     brad.txt every 2.000000
     brad.txt stabilize 1.000000

text inputfile (first file argument on the command line):

    0 (M)([ 0]  [times]sacj 

    1 (M)(F)( )[ 1]  July 1992 

    2 (M)( )(L)[ 0]  8 10 1992 70 

    3 (M)(F)(L)[ 1]  

    4 (T)(F)( )[ 1]  Qualitative Reasoning: An Introduction 

-----------------------------------------

The program will decide from the number of fields in the first line
if the weights-file is a word-weight or a document-word-weight table.
In the second case the name of the text-file should match one of the
document-names.

If lines from the text-file start with a number, it is supposed to be
a linenumber. If not, the linenumber counter is increased automatically.
This will fail if lines start for some reason with an integer number 
(which is very rare - I hope)

If words are tagged, they should be tagged in both files and the tag
should be a *prefix*, separated by an underscore. If so, they may be
counted separately on v(erb), n(oun) and (ad)j.

The averages can be computed on all tokens in a sentence or only on
tokens with a non-zero weight.

Sentences can be either complete inputlines or a fixed number of tokens.
In the second case non-alphanumeric tokens are suppressed.

Complete inputlines exist of:
possibly a linenumber, some codes ended by a ']' and the sentence.

-w<0,1,2,3,13>
 w0:   The chains can be computed on all tokens in a sentence or only on
   tokens with a non-zero weight in the weights-file. 

 w1,2,3,13:   Furthermore they can be restricted (only in tagged files)
   to nouns (1), verbs (2), adjectives (3) or nouns  and adjectives (13).
   If words are tagged, they should be tagged in both files and the tag
   should be a *prefix*, separated by an underscore.

-s<n>
   Sentences can be either complete inputlines or a fixed number of tokens.
   In the second case non-alphanumeric tokens are suppressed. The option 's'
   causes the program to take groups of <n> tokens as a line (default 20).

-v
   Verbose. Mainly for debugging.

-d 
   Debug. Some more debug info.
*/

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include "utils.h"

/* File-scope state shared by the functions in this file. */
char   regel[1024];        /* current input line; filled by get_next_line() in main() */
int    n,x,complete;       /* n: index into the weights table, x: token counter (both used
                              by count_word_categories); complete: set by option -c, makes
                              main() append the word counts to every output line */
int    nouns,verbs,adjs;   /* per-sentence tag counts, reset and filled by count_word_categories() */
float  fl;                 /* not referenced in the visible part of this file */

/* ---------------------------------------------------------------- */
/*
 * Print the usage message and the list of options to stdout, then
 * terminate the program with exit status 1.  Never returns.
 *
 * Called both for an explicit -h and when no arguments are given.
 */
void help(void)
{
printf("Usage: sent_weight [-options] text-file weights-file\n");
printf("prints sentence_number and mean_of_weights to stdout\n");
printf("       A    : print date & actual commandline, preceded by %c-sign\n",comment);
printf("              will also print the recordnumber preceded by %c-sign\n",comment);

printf("       a    : make words of a-z only (default is alphanumeric [a-z0-9])\n");
printf("       c    : count (adds count of all_words, weighted_words, nouns,adjs,verbs)\n");

printf("       d    : debug info\n");
printf("       h    : Help (this message)\n");
printf("       i    : add sentence-codes to output\n");
printf("       n    : text has no codes in front of sentence (ended by ']')\n");
printf("       L<n> : minimum length of words to be considered (default=2)\n");

printf("       o<name> : discard sentences with strings from file 'name'\n");
printf("       O<name> : discard sentences without strings from 'name'\n");

printf("       p     : print statistics (with other output)\n");
printf("       P     : print statistics (exclusive)\n");

printf("       q<name>[!]: use list of stop words ('!'=ignore tags).\n");
printf("       Q<name>: ignore all but list of obligate words\n");

printf("       r<recsep>  : recognize recsep as record-separator.\n");
printf("       R<name>  : make file with record- and line-numbers.\n");

printf("       s<n> : use artificial lines of <n> tokens instead of lines\n");
printf("              default= 20 tokens\n");
printf("       S<name>: print this artificial file to file name\n");


printf("       v    : verbose\n");
printf("       w0   : compute all (only with weights-file)\n");
printf("       w1   : compute only nouns\n");
printf("       w2   : compute only verbs\n");
printf("       w3   : compute only adjs\n");
printf("       w13  : compute only nouns and adjs\n");
printf("              note that w1-w13 make only sense with tagged files\n");
printf("              the same is true for option c.\n");

printf("Copyright Hans Paijmans 1995,1996\n");
printf("version 2.01, 20 sept. 1996\n");
exit(1);
}

/* ---------------------------------------------------------------- */
/*
 * Parse the command line and store the result in the program-wide
 * option variables.
 *
 * argc, argv : the arguments as received by main().
 *
 * After the options, the first remaining argument is copied to
 * `tekstfile` and the second (if present) to `indexfile`; for the latter
 * the file type is determined with check_file_type().
 *
 * Calls help() (which exits) when no arguments are given or when the
 * first argument is "-h"; calls error_exit() on an invalid -L value, on
 * files that cannot be opened for -R / -S, and when -R is used without
 * a record separator (-r).
 */
void get_arguments(int argc,char **argv)
{
int o;      /* getopt() returns an int; stored in a plain char the
               comparison with -1 never succeeds where char is unsigned */
int i;
size_t len;
extern int optind;
extern char *optarg;

if ((argc<2) || (!strcmp(argv[1],"-h"))) help();

complete=0;
while ((o=getopt(argc,argv,"AacdhiL:no:O:Ppq:Q:r:R:s:S:vw:"))!=-1)

     switch (o)
       {
       case 'A': /* echo the command line as a comment line */
                 printf("%c",comment);
                 for (i=0;i<argc;i++) printf("%s ",argv[i]);
                 printf("\n");
                 print_rec=1;
                 break;

       case 'a': /* alphanumerics, or alpha's only */
                 word_control=1;break;
       case 'c': complete=1; break;
       case 'd': debug=1; break;
       case 'h': help();break;
       case 'i': add_codes=1;break;

       case 'L': min_length=atoi(optarg);
                 if (min_length<1) error_exit("minimum length must be >0","");
                 break;

       case 'n': no_codes=1;break;

       case 'o': do_sig_check=1;
                 sig_check=lees_woordenlijst(optarg,&sig_lijst[0]);
                 break;
       case 'O': do_sig_check=-1;
                 sig_check=lees_woordenlijst(optarg,&sig_lijst[0]);
                 break;

       case 'p': statistics=1;break;
       case 'P': statistics=2;break;

       case 'q': check_stopwoorden=1;
                 /* a trailing '!' is a flag, not part of the file name;
                    guard against an empty argument before indexing */
                 len=strlen(optarg);
                 if ((len>0) && (optarg[len-1]=='!'))
                    {
                    optarg[len-1]=0;
                    stop_tags=1;
                    }
                 aantal_stopwoorden=lees_woordenlijst(optarg,&stop_lijst[0]);
                 break;
       case 'Q': check_stopwoorden=-1;
                 aantal_stopwoorden=lees_woordenlijst(optarg,&stop_lijst[0]);
                 break;

       /* NOTE(review): recsep, tekstfile and indexfile are declared
          elsewhere; these strcpy() calls are unbounded - confirm the
          buffer sizes or replace with a bounded copy. */
       case 'r': strcpy(recsep,optarg);
                 break;
       case 'R': write_rec=1;
                 if ((rec_fuit=fopen(optarg,"w"))==NULL)
                                error_exit("no recsep-file opened:",optarg);
                 break;

       case 's': complete_lines=0;
                 number_of_tokens=atoi(optarg);
                 if (number_of_tokens==0) number_of_tokens=20;
                 break;
       case 'S': sentences=1;
                 if ((fuit=fopen(optarg,"w"))==NULL)
                                error_exit("no sentence-file opened: ",optarg);
                 break;

       case 'v': verbose=1;break;
       case 'w': only_weighted_words=atoi(optarg);break;

       default:  /* unknown option: getopt() already reported it */
                 break;
       }
/* --- */

if (number_of_tokens==0) number_of_tokens=20;

if (optind<argc) strcpy(tekstfile,argv[optind++]);

if (optind<argc)
    {
    strcpy(indexfile,argv[optind++]);
    if (only_weighted_words==-1) only_weighted_words=0;
    weight_filetype = check_file_type(indexfile);
    }

if ((write_rec) && (recsep[0]==0))
    error_exit("attempted R-option without specifying record separator","");

if (debug)
   {
   check_all();
   printf("statistics           %d\n",statistics);
   printf("indexfile            %s\n",indexfile);
   printf("tekstfile            %s\n",tekstfile);
   }

}


/* ---------------------------------------------------------------- */


/* ---------------------------------------------------------------- */


/* ---------------------------------------------------------------- */
/*
 * Compute the mean weight of one sentence.
 *
 * regel : the sentence; it is modified in place (a leading code part up
 *         to ']' may be removed, and strtok() cuts it into tokens on the
 *         separators ".],:; ").
 *
 * For every token the longest concept in the weights table `serie` that
 * is a prefix of the token is looked up; its weight is the token's
 * weight (0 when nothing matches).  Depending on `only_weighted_words`
 * the weights of all tokens, or only of tokens starting with 'n', 'v'
 * or 'j', are summed.
 *
 * Side effects: sets the globals nouns, verbs, adjs (tokens starting
 * with 'n', 'v', 'j'), number_of_words, number_of_weighted_words, and
 * uses the globals x and n as scratch counters.
 *
 * Returns the summed weight divided by
 *    -1 : number_of_words
 *     0 : number_of_weighted_words
 *     1 : nouns        2 : verbs        3 : adjs
 *    13 : nouns + adjs
 * or 0 when that divisor is zero or the mode is unknown.
 */
float count_word_categories(char *regel)
{
float langste_gewicht,temp;
char  *token,*s;
int   lengte,langste_lengte;

nouns=verbs=adjs=0;
temp=.0;
number_of_words=number_of_weighted_words=0;
s=NULL;
x=0;
if (no_codes) s=strchr(regel,']');
/* source and destination overlap, so strcpy() would be undefined here */
if (s) {s++;memmove(regel,s,strlen(s)+1);}

if (verbose) printf("\nregel: %s\n",regel);
token=strtok(regel,".],:; ");

while (token)
    {
    x++;
    switch (token[0])
	      {
	      case 'n': nouns++;break;
	      case 'v': verbs++;break;
	      case 'j': adjs++;break;
	      }

    /* find the longest concept that is a prefix of this token; only the
       length of the best match is needed, so no copy of it is kept */
    n=0;langste_lengte=0;langste_gewicht=.0;
    while (n<num_of_weights)
              {
	      lengte=strlen(serie[n].concept);
              if (!strncmp(token,serie[n].concept,lengte))
	             {
                     if (langste_lengte<lengte)
                            {
                            langste_lengte=lengte;
                            langste_gewicht=serie[n].w;
		            }
                     /* exact match: nothing longer can follow, stop searching */
                     if ((int)strlen(token)==lengte) n=num_of_weights;
                     }
              n++;
              }

    if (verbose)
              {
              printf("num_of_weights %d; token: %s \tgewicht - %f\n",num_of_weights,token, langste_gewicht);
              }

    switch (only_weighted_words)
              {
              case -1 :
              case  0 : temp+=langste_gewicht;break;
              case  1 : if (token[0]=='n') temp+=langste_gewicht;
                            break;
              case  2 : if (token[0]=='v') temp+=langste_gewicht;
                            break;
              case  3 : if (token[0]=='j') temp+=langste_gewicht;
                            break;
              case  13: if ((token[0]=='n') || (token[0]=='j'))
                                          temp+=langste_gewicht;
                            break;
              }

    number_of_words++;
    if (langste_gewicht) number_of_weighted_words++;
    token=strtok(NULL,".],:; ");
    }
/* NOTE(review): one token is deliberately not counted as a word here;
   presumably the leading line number - confirm against get_next_line(). */
number_of_words--;

if (debug)
  {
  printf("total weights: \t%f\nweighted_words: \t%d\nall words: \t%d\n",
                         temp,number_of_weighted_words,number_of_words);
  }

switch (only_weighted_words)
   {
   case -1 : if (number_of_words)
                   return(temp/(float)number_of_words);
             break;
   case  0 : if (number_of_weighted_words)
                   return(temp/(float)number_of_weighted_words);
	     break;
   case  1 : if (nouns) return (temp/nouns);
             break;
   case  2 : if (verbs) return(temp/verbs);
             break;
   case  3 : if (adjs) return(temp/adjs);
             break;
   /* divide by the combined count; "temp/nouns+adjs" added adjs to the quotient */
   case  13: if (nouns+adjs) return(temp/(float)(nouns+adjs));
             break;
   }

return 0;
}

/* ---------------------------------------------------------------- */
/*
 * Program entry point.
 *
 * Parses the options, loads the weights table when the weights-file type
 * calls for it, then reads the text-file line by line.  For every line
 * longer than one character the mean weight is computed with
 * count_word_categories() and, unless -P was given, printed together
 * with the line number (plus the word counts with -c and the sentence
 * codes with -i).  With -p or -P the mean over all processed sentences
 * is printed at the end.
 *
 * Exits with status 0; error_exit() is called when the text-file cannot
 * be opened.
 */
int main(int argc,char **argv)
{
float average,sum;
int aantal;          /* number of sentences processed */

initialize(argv[0]);
get_arguments(argc,argv);


if (((weight_filetype>=0) && (weight_filetype<2))
   ||
  ((weight_filetype>=4) && (weight_filetype<6)))
           num_of_weights=haal_gewichten(indexfile,tekstfile);

if ((fin=fopen(tekstfile,"r"))==NULL) error_exit("no txt-file",tekstfile);
no_codes=check_codes(fin);
/* start at 0 so that the mean is not divided by sentences+1 */
aantal=0;sum=0;

while ((!feof(fin)) && (bezig))
      {
      bezig=get_next_line(fin,regel);
      if (debug) printf("%s\n",regel);

      if (strlen(regel)>1)
         {
          average=count_word_categories(regel);
          sum+=average;
          aantal++;
          if (statistics<2)
	      {
              printf(" %4d       %f  ",regel_nummer,average);
              if (complete) printf("%3d   %3d   %3d   %3d   %3d  ",
                    number_of_words,number_of_weighted_words,nouns,verbs,adjs);
              if (add_codes) printf("%s\t",code);
              printf("\n");
              }
         }

    }
/* -p prints the statistics in addition to the lines, -P instead of them */
if (statistics) printf("mean %f\n",aantal ? sum / aantal : 0.0);

fclose (fin);
/* these streams are only opened by the -S and -R options */
if (fuit) fclose(fuit);
if (rec_fuit) fclose(rec_fuit);
exit(0);
}





