
/*

   PROGRAM NAME:  discrim.c

   PURPOSE:  Compute the discrimination value of terms.
            
   DISCLAIMER:   This is meant to be instructional material only, and it 
   is not guaranteed to perform as intended by its authors.  No warranty 
   of this software is stated or implied by the authors, editors, testers, 
   publishers, or suppliers of this code. (Or the hackers! -- DSD)


   INPUT FILES REQUIRED:

              a filename, file consists of sequences of:

                document#    term    weight
                For example: 1 bread 3.0       
         (multiple entries for any document should be grouped together )

             Output is not sorted, you must pipe it through
             the sort filter like this:
             discrim filename | sort -nr +1

   NOTES:    Filters such as stop lists and stemmers should be used
             before running this program.

             This started out as Padmini Srinivasan's 'select.c' program from
             the book  Information Retrieval: Data Structures and Algorithms
             Edited by William B. Frakes and Ricardo Baeza-Yates
             Prentice-Hall, 1992 ISBN: 0-13-463837-9. The source code for
             the book can be retrieved at ftp://ftp.vt.edu/pub/reuse/IR.code

       Dave Dubin made the following changes to the program:
                0) Took out everything not needed for term discrimination.
                1) The inverted file is no longer needed. It is created on
                   the fly from the direct file by forking a child process
                   to sort the records.
                2) The original program computed similarities in a hamming
                   space. Term weights now participate in the computations.
                3) The program will compute proximities using either the
                   cosine measure, angle in radians, or Euclidean distance.
       
       Hans Paijmans rewrote the handling of the indices. The original
                linked lists were discarded in favor of matrices. This 
                made the program *much* faster (May 1995). Other
                improvements by Dave Dubin were preserved.

     
   PARAMETERS TO BE SET BY USER:
            
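             1) MAX_CONCEPTS - maximum number of keywords. Note that no
                MAX_CONCEPTS macro exists in this file; the limit is the
                hard-coded size (10000) of the woorden[] array.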
                      
	     
*/

#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Default comment-line marker; main() copies it into the global `comment`. */
#define COMMENT    '#'
/* Values for the global `proximity` (option -m); similarity() dispatches on them. */
#define COSINE     0
#define ANGLE      1
#define EUCLIDEAN  2
/* NOTE(review): similarity() has no JACCARD case, so this value ends up in
   its default branch (cosine). */
#define JACCARD    3

/* NOTE(review): <stdio.h> normally supplies SEEK_SET already; this
   definition looks redundant -- confirm before removing. */
#define SEEK_SET 0
/* Separator printed between a word and its discrimination value in main(). */
#define FIELDSEP " "

/*
 * One node of the singly linked term list that starts at eerste_cent.
 *
 * The struct now carries the tag `cent`, so that `volgende` really points
 * to the same type.  Previously the struct was anonymous and
 * `struct cent *` named an unrelated incomplete type, which is why the
 * list code had to cast through (void*) everywhere.  Those casts remain
 * valid with this definition.
 */
typedef struct cent
   {
   int hits;                /* number of records that contributed to this term */
   float avg_wt;            /* running sum of weights; main() later divides it
                               by the number of documents */
   float weight;            /* not referenced by the code in this file */
   float dv;                /* not referenced by the code in this file */
   int concept_num;         /* row index of this term in matrix[] */
   char concept[40];        /* the term itself, at most 39 characters + NUL */
   struct cent *volgende;   /* next node, NULL at the end of the list */
   } cent;

/* NOTE(review): appears unused in the visible code -- every `s` used inside
   the functions of this file is a parameter or a local. */
char s[100];
/* Head of the linked term list; main() allocates the first node before any
   term is added, add_to_centroid() appends behind it. */
cent *eerste_cent;

/* Term counter: incremented by add_to_centroid() for every new term and
   once more in main() after the first pass over the file. */
int  words = 0;
/* Number of documents, counted in main() as changes of the document field. */
int number_of_docs;
/* Similarity measure selector (option -m), see COSINE/ANGLE/EUCLIDEAN. */
int proximity;
/* Column layout used by scanfile(): 0 = weight in the second column,
   1 = weight in the third column, -1 = let main() detect it. */
int filetype;
/* Set by the 'w' case in get_arguments(); when non-zero main() suppresses
   the two header lines that start with the comment character. */
int kaal;
/* Non-zero enables the diagnostic printf calls (option -d). */
int debug;
/* Non-zero makes main() print the centroid and exit (option -c). */
int print_only_centroid;

/* The index file named on the command line, opened in main(). */
FILE *indexfile;
/* s1, s2: token buffers filled by scanfile() (main() allocates 50 bytes each);
   naam: input file name; vorige: previous document id; filenaam: document id
   written by get_document_vector(); comment: comment-line marker. */
char  *s1,*s2,naam[40],vorige[40],filenaam[40],comment;
/* f: most recently read weight; fl: buffer scanfile() stores the weight in. */
float f,*fl;
/* matrix[concept_num][doc] = weight of that term in that document. */
float **matrix;
/* Per-term average weight, filled in main() in list order. */
float *centroid_array;
/* Copies of the term strings, filled in main() in list order. */
char  *woorden[10000];

/* ---------------------------------------------- */
/*
 * Print the usage text on stdout and terminate the program with status 1.
 * The function never returns; the int return type is kept for existing
 * callers.
 *
 * Corrections relative to the former text:
 *   - the -f values were described the wrong way round: filetype 0 makes
 *     scanfile() parse "doc weight word" (weight in the second column) and
 *     filetype 1 parses "doc word weight" (weight in the third column);
 *   - the A option was shown without its dash, and claimed to print a date
 *     although only the command line is printed;
 *   - the -d and -h options accepted by get_arguments() were not listed.
 */
int help(void)
{
    printf("Usage : discrim [-options] index-file\n");

    printf("Computes centroid and discrimination values from weighted indexfile.\n");
    printf("index-file may be both file-word-weight or file-weight-word,\n");
    printf("but should be sorted on file.\n");
    printf("     -A    : print actual commandline, preceded by %c-sign\n", comment);
    printf("     -c    : only print centroid to stdout\n");
    printf("     -d    : print debug information\n");
    printf("     -f<n> : force file recognition\n");
    printf("        0  : weight in second column\n");
    printf("        1  : weight in third column\n");
    printf("     -h    : print this help text\n");
    printf("     -m<n> : measure to use for similarity:\n");
    printf("        0  : cosine (default)\n");
    printf("        1  : angle in radians\n");
    printf("        2  : euclidean distance\n");
    printf("     -w    : only print words & discrimination values\n");

    printf("Paijmans 1995, 1996\n");
    printf("Version 2.01, 19 okt. 1996\n");
    exit(1);
}
/* ---------------------------------------------- */
/*
 * Report a fatal error: writes "Error: <s>, exiting..." to stdout, then
 * hands over to help(), which prints the usage text and ends the program
 * with status 1.  The trailing exit(1) is only a safety net.
 */
int error_exit(char *s)
{
    fprintf(stdout, "Error: %s, exiting...\n", s);
    help();
    exit(1);
}
/* ---------------------------------------------- */

/*
 * Read the next data line from indexfile and split it into its fields.
 *
 * Lines that start with the global `comment` character or that are shorter
 * than three characters are skipped.  Depending on the global `filetype`
 * the line is parsed as "doc weight word" (0) or "doc word weight" (1);
 * the document id goes to s1, the word to s2 and the weight to *f.
 *
 * Returns the number of fields sscanf converted (3 for a well-formed line,
 * 0 when filetype is neither 0 nor 1), or EOF when no further line could
 * be read.
 *
 * Changes relative to the former version:
 *   - end of input is detected from the fgets() result instead of feof(),
 *     so a final line without a trailing newline is no longer discarded
 *     and the line buffer is never parsed uninitialised;
 *   - string fields are limited to 39 characters, because the values are
 *     later copied into the 40-byte buffers of this file (concept[],
 *     vorige[], filenaam[]); s1 and s2 must therefore hold at least
 *     40 bytes;
 *   - the field count is initialised for unknown filetype values.
 */
int scanfile(FILE *indexfile, char *s1, char *s2, float *f)
{
    char regel[1000];
    int n = 0;

    for (;;) {
        if (fgets(regel, sizeof regel, indexfile) == NULL)
            return EOF;
        if (regel[0] != comment && strlen(regel) >= 3)
            break;                      /* found a data line */
    }

    if (filetype == 0)
        n = sscanf(regel, "%39s %f %39s", s1, f, s2);
    else if (filetype == 1)
        n = sscanf(regel, "%39s %39s %f", s1, s2, f);

    if (debug) printf("%d filetype %d - %s%s.%s.%f.\n", n, filetype, regel, s1, s2, *f);

    return n;
}
/* ---------------------------------------------- */
/*
 * Add weight f for term s to the term list that starts at eerste_cent.
 *
 * If the term is already in the list its weight sum (avg_wt) and hit count
 * are updated.  Otherwise a new node is appended; its concept_num is one
 * more than the concept_num of the node that used to be last, and the
 * global `words` is incremented.
 *
 * Terms are stored with at most 39 characters, so the comparison is limited
 * to the same length; otherwise a longer term would never match its own
 * stored copy and would be appended again on every occurrence.
 *
 * Always returns 0 (the former version left the int result undefined).
 * On allocation failure a message is written to stderr and the program
 * exits with status 1.
 */
int add_to_centroid(char *s, float f)
{
    cent *laatste = NULL;
    cent *deze;
    cent *nieuw;
    int n = 0;

    /* The rest of the file treats the first node as a placeholder (the
       loops over the terms start at index 1), so create it if the caller
       has not done so. */
    if (eerste_cent == NULL) {
        eerste_cent = calloc(1, sizeof *eerste_cent);
        if (eerste_cent == NULL) {
            fprintf(stderr, "Error: out of memory, exiting...\n");
            exit(1);
        }
    }

    for (deze = eerste_cent; deze != NULL; deze = (void *)deze->volgende) {
        if (strncmp(deze->concept, s, sizeof deze->concept - 1) == 0) {
            deze->avg_wt += f;
            deze->hits++;
            return 0;
        }
        n = deze->concept_num + 1;
        laatste = deze;
    }

    /* calloc so that the fields not set below (weight, dv) are zero */
    nieuw = calloc(1, sizeof *nieuw);
    if (nieuw == NULL) {
        fprintf(stderr, "Error: out of memory, exiting...\n");
        exit(1);
    }

    strncpy(nieuw->concept, s, sizeof nieuw->concept - 1);
    nieuw->concept[sizeof nieuw->concept - 1] = '\0';   /* strncpy may not terminate */
    nieuw->concept_num = n;
    nieuw->avg_wt = f;
    nieuw->hits = 1;
    nieuw->volgende = NULL;

    laatste->volgende = (void *)nieuw;
    words++;
    return 0;
}

/* ---------------------------------------------- */
/*
 * Read the records of one document from indexfile and store their weights
 * in column `doc` of the global matrix.
 *
 * The function works on the globals s1/s2/f, which hold the record that
 * was read last.  When `vorige` is empty (first call) a record is read
 * first; otherwise the record left over from the previous call, which is
 * the first record of this document, is used.  Records are consumed until
 * the document id changes or the input ends.
 *
 * vorige : in/out, id of the document being processed; set to "" when the
 *          end of the input has been reached.  It must have room for
 *          sizeof filenaam bytes (main() passes the global vorige[40]).
 * doc    : column index in matrix for this document.
 *
 * Returns the number of additional records read before the document id
 * changed, or 0 when the end of the input was reached.
 *
 * Changes relative to the former version: EOF on the very first read is
 * handled instead of processing unset buffers, document ids are copied with
 * a length limit, terms are compared over the 39 characters that the list
 * stores, and unused locals were removed.
 */
int get_document_vector(char *vorige,int doc)
{
    const size_t keep = sizeof filenaam - 1;
    int w = 0;
    cent *deze;

    if (vorige[0] == 0) {
        if (scanfile(indexfile, s1, s2, fl) == EOF)
            return 0;                   /* nothing to read; vorige stays empty */
        f = *fl;
    }

    do {
        /* look up the row of the current term and store its weight */
        for (deze = eerste_cent; deze != NULL; deze = (void *)deze->volgende) {
            if (strncmp(deze->concept, s2, sizeof deze->concept - 1) == 0) {
                matrix[deze->concept_num][doc] = f;
                break;
            }
        }

        strncpy(vorige, s1, keep);
        vorige[keep] = '\0';
        strncpy(filenaam, s1, keep);
        filenaam[keep] = '\0';

        if (scanfile(indexfile, s1, s2, fl) == EOF) {
            vorige[0] = 0;
            return 0;
        }
        w++;
        f = *fl;
    } while (strncmp(s1, vorige, keep) == 0);

    return w;
}
/* ---------------------------------------------- */
float cosine(float l1,float l2,float common)
{
float temp;
if ((l1==0) || (l2==0)) return(0.0);

temp=sqrt(l1*l2);
return(common/temp);
}

/* ------------------------------------------------ */
float angle(float l1,float l2,float common)

{
float temp1, temp2;
double temp3, temp4;
if (l1 == 0 || l2 == 0)
  temp3 = 0.0;
else 
temp3 = common/sqrt(l1 * l2);
temp4 = acos(temp3);
temp2 = temp4;
return(temp2);
}

/***************************************************************************

     euclid(l1,l2,common)

     Returns:  float

     Purpose:  Returns Euclidean distance between two documents           

**/

float euclid(float l1,float l2,float common)

{

 /* use the law of cosines */
  return(sqrt( l1 + l2 - (2 * common)));

}


/* ---------------------------------------------- */
/*
 * Apply the measure selected by the global `proximity` to the three
 * quantities l1, l2 and common.  ANGLE and EUCLIDEAN select angle() and
 * euclid(); COSINE and every other value select cosine().
 */
float similarity(float l1,float l2,float common)
{
    if (proximity == ANGLE)
        return angle(l1, l2, common);

    if (proximity == EUCLIDEAN)
        return euclid(l1, l2, common);

    return cosine(l1, l2, common);
}
/* ---------------------------------------------- */
/*


------------------------------------------------- */

/* -------------------------------------------------------- */

/*
 * Helper for get_doc_data(): walk the term rows first .. last-1 and add,
 * for document column `doc`,
 *   - the squared weight of every positive matrix entry to *doc_sq,
 *   - weight * centroid value to *dot,
 *   - the squared centroid value to *cen_sq.
 * The order of the additions is the same as in the two loops this helper
 * replaces.
 */
static void accumulate_doc_terms(int doc, int first, int last,
                                 float *doc_sq, float *cen_sq, float *dot)
{
    int n;

    for (n = first; n < last; n++) {
        float w = matrix[n][doc];
        float c = centroid_array[n];
        float sq;

        if (w > 0) {                    /* only if there is a word-weight */
            sq = w * w;
            *doc_sq += sq;
        }

        *dot += (w * c);

        sq = c * c;
        *cen_sq += sq;
    }
}

/*
 * Collect the three quantities the similarity measures need for document
 * `doc`, leaving one term out.
 *
 * s is the term to leave out; "-" means leave nothing out.  The row of the
 * term is looked up in the term list, then the rows 1 .. row-1 and
 * row+1 .. words-1 are accumulated.  If the term is not found the row
 * stays 0, which has the same effect as "-".
 *
 * *l1     receives the sum of squares of the document's positive weights,
 * *l2     receives the sum of squares of the centroid values,
 * *common receives the sum of weight * centroid value.
 */
void get_doc_data(int doc,float *l1,float *l2,float *common,char *s)
{
    int concept_no = 0;
    cent *deze;
    float count1 = 0.0, count2 = 0.0, com = 0.0;

    if (strcmp(s, "-")) {
        /* find the row of the term that must be left out */
        for (deze = eerste_cent; deze != NULL; deze = (void *)deze->volgende) {
            if (!strcmp(s, deze->concept)) {
                concept_no = deze->concept_num;
                break;
            }
        }

        accumulate_doc_terms(doc, 1, concept_no, &count1, &count2, &com);
    }

    accumulate_doc_terms(doc, concept_no + 1, words, &count1, &count2, &com);

    *l1 = count1;
    *l2 = count2;
    *common = com;
}
/* ---------------------------------------------- */
/*
 * float av_doc_similarity(char *s)
 *
 * For every document, get_doc_data() supplies the document's sum of
 * squares, the centroid's sum of squares and their inner product, with
 * term s left out ("-" leaves nothing out).  similarity() turns these into
 * one value per document; the function returns the mean of those values
 * over all number_of_docs documents.
 */
float av_doc_similarity(char *s)
{
    float running = 0.0;
    float doc_sq, cen_sq, dot;
    int d = 0;

    while (d < number_of_docs) {
        get_doc_data(d, &doc_sq, &cen_sq, &dot, s);
        running += similarity(doc_sq, cen_sq, dot);
        d++;
    }

    return running / (float)(number_of_docs);
}



/* ---------------------------------------------------------------- */
/*
 * Parse the command line.
 *
 * Sets the globals proximity (default 0), filetype (default -1, meaning
 * "detect"), debug, print_only_centroid and kaal from the options, and
 * copies the first non-option argument into naam.  When no file name is
 * given naam is left unchanged.
 *
 * Changes relative to the former version:
 *   - the getopt() result is kept in an int; stored in a char it can never
 *     compare equal to -1 on platforms where char is unsigned;
 *   - 'w' was missing from the option string, so the -w option handled
 *     below could never be reached;
 *   - a file name that does not fit into naam is rejected instead of
 *     overflowing the buffer;
 *   - the argv loop index is an int, and the unused local was removed.
 */
void get_arguments(int argc,char **argv)
{
    int o, n;
    extern int optind;
    extern char *optarg;

    proximity = 0;
    filetype = -1;

    while ((o = getopt(argc, argv, "Acdhwf:m:")) != -1) {
        switch (o) {
        case 'A':
            printf("%c ", comment);
            for (n = 0; n < argc; n++) printf("%s ", argv[n]);
            printf("\n");
            break;

        case 'h':
            help();                     /* does not return */
            break;

        case 'd': debug = 1; break;
        case 'f': filetype = atoi(optarg); break;
        case 'c': print_only_centroid = 1; break;
        case 'w': kaal = 1; break;
        case 'm': proximity = atoi(optarg); break;

        default:                        /* unknown options are ignored, as before */
            break;
        }
    }

    if (optind < argc) {
        if (strlen(argv[optind]) >= sizeof naam) {
            fprintf(stderr, "Error: file name too long (max %d characters), exiting...\n",
                    (int)sizeof naam - 1);
            exit(1);
        }
        strcpy(naam, argv[optind++]);
    }

    if (debug) {
        printf("naam      %s\n", naam);
        printf("print_only_centroid %d\n", print_only_centroid);
        printf("kaal      %d\n", kaal);
        printf("simil.c   %d\n", proximity);
    }
}

/* ---------------------------------------------- */
int main(int argc,char **argv)
{
char s3[20],argument[20],regel[1000];
int n,m,number_of_words,teller=0;
int doc;
float dv,density,sum[10000],f1,f2;
cent *deze;

comment=COMMENT;
print_only_centroid=0; 
kaal=0;
proximity=0;
debug=0;


if (argc<2) help();

get_arguments(argc,argv);

s1=(char*)malloc(50);
s2=(char*)malloc(50);
fl =(float*)malloc(sizeof(float));

words=vorige[0]=number_of_docs=0;
eerste_cent=(void*)malloc(sizeof(cent));
eerste_cent->volgende=NULL;

if ((indexfile=fopen(naam,"r"))==NULL) error_exit ("File not found");

/* find out what order the data are in */
if (filetype==-1)
   {
   do
      fgets(regel,999,indexfile);
   while (((regel[0]==comment) || (strlen(regel)<3)) && (!feof(indexfile)));

   sscanf(regel,"%s %s %s",s1,s2,s3);

   if (debug) 
           {
           printf("--->%s %s %s<---\n",s1,s2,s3);
           printf("%f %f\n",atof(s2),atof(s3));
           }

   if ((atof(s2)!=0) && (atof(s3)==.0)) filetype=0; else filetype=1;

   fseek(indexfile,0,SEEK_SET);
   }

while (!feof(indexfile))
  {
  if (scanfile(indexfile,s1,s2,fl)!=EOF)
     {
      f=*fl;
      if (strcmp(vorige,s1)) number_of_docs++;
      strcpy(vorige,s1);
      add_to_centroid(s2,f);
      teller++;
     }
  }

words++; 

if ((centroid_array=(float**)malloc((words+1)*sizeof(centroid_array)))==NULL) 
                                                       error_exit("malloc 1");
n=0;

deze=eerste_cent;


while (deze!=NULL)
     {
     deze->avg_wt /= number_of_docs;
     centroid_array[n]=deze->avg_wt;
     if (print_only_centroid) 
                {
                if (debug) printf ("%4d\t",deze->concept_num);
                printf("%s\t%f\t%s\n", 
			   naam,centroid_array[n],  deze->concept);
                }

     woorden[n]=(char*)malloc(strlen(deze->concept)+1);
     strcpy(woorden[n],deze->concept);
     n++;
     deze=(void*)deze->volgende;
     }


if (!kaal) printf("%c words in centroid: %d, docs: %d\n",comment,words-1,number_of_docs);

if (print_only_centroid) exit(0);


/*--------------------------------------------------------------------*/
/**** now we have the centroid ready and we process the file again ****/
/**** to get the document-vectors */

/* first make matrix on size */

if ((matrix=(float**)malloc((words+1)*sizeof(matrix)))==NULL) 
                                                       error_exit("malloc 1");

for (n=0;n<words+1;n++) 
    {
    if ((matrix[n]=(float*)malloc((number_of_docs+2)*sizeof(float)))==NULL) 
                                                       error_exit("malloc 2");
    memset(matrix[n],0,(number_of_docs+1)); 
    }


/* ------------------------------------------------------------------- */

fseek(indexfile,0,SEEK_SET);
vorige[0]=0;
doc=0;

/*   read every word-concept combination and store weight in matrix   */

do
  {
  sum[doc]=0.0;
  number_of_words=get_document_vector(vorige,doc);
  doc++;
  }
while (vorige[0]!=0);

/**********************************************************************/
/** the density of the collection is the average of the similarities **/

density = av_doc_similarity("-");
if (!kaal) printf("%c baseline %f\n",comment,density);

for (n=1;n<words;n++) 
    {
    if (proximity==0) dv=av_doc_similarity(woorden[n]) - density;
                else  dv=density - av_doc_similarity(woorden[n]);
    printf("%s%s%f\n",woorden[n],FIELDSEP,dv);
    }
/**********************************************************************/

fclose(indexfile);
}









