/*

Description

A query is compared with a database. 

Some tuples in the database are returned as possible relevant records,
here called 'the resultsfile'.

A number of these tuples are selected as 'good' answers to that query;
the rest are considered 'bad'.

Using Rocchio's algorithm the weights in the query vector are adjusted
towards the selected document vectors.

Input:  query, database
Output: adjusted query

==========================================

Usage: roccio [options] [-a#][-b#] [-c#] [-f set_of_examples] [queryfile] resultfile 

If no queryfile is given, the first vector in the
resultfile is considered to be the query.

Every tuple of course is a document vector.

The set_of_examples is a list of integers that point to 
those documents in the database that act as 'good' examples.

Alternatively, the last value in each document vector may act as an
indicator of whether that document is a good or a bad example.

*/

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include "utils.h"

/* Size in bytes of the line buffer that read_vector() declares. */
#define BUF_SIZE 64000
/* Capacity (number of floats) of the weight array in a word_vector. */
#define VECTOR_SIZE 32000

/*
 * One record of the input: a label followed by numeric weights.
 * read_vector() fills 'concept' from the first token of a line and
 * 'weight' from the remaining tokens (converted with atof).
 * When separate judgements are supplied (-f), main() additionally stores
 * a relevance flag in weight[vector_length].
 * WORD_LENGTH is not defined in this file (presumably it comes from
 * utils.h -- confirm).
 */
typedef struct
   {
   char concept[WORD_LENGTH];
   float weight[VECTOR_SIZE];
   }word_vector;

/* Line buffer; not referenced by any code visible in this file. */
char   regel[1024];
/* File names taken from the positional arguments by get_arguments():
   qnaam is the first one (main() reads the query from it), dnaam the
   optional second one (empty string when absent). */
char   qnaam[WORD_LENGTH],dnaam[WORD_LENGTH];
/* vector_length: number of weights parsed by the most recent call to
   read_vector(); main() adjusts it when no -f file was given.
   no_query_file, last_score: both set to 1 by get_arguments();
   no_query_file is tested in main() before the query is read,
   last_score is only printed in debug mode. */
int    vector_length, no_query_file, last_score;
/* Document numbers read from the -f file and how many of them there are;
   main() uses each number as an index into its document array. */
int    num_examples,examples[100];
/* sum_rel, sum_irrel: per-component accumulators used in main().
   alpha, beta, gama: coefficients set by -a, -b and -c (default 1). */
float sum_rel,sum_irrel,alpha,beta,gama;


/* ---------------------------------------------------------------- */
/*
 * Print the usage summary and version banner to standard output,
 * then terminate the program with exit status 1.  Never returns.
 */
void help()
{
/* One entry per output line; none of them contains a conversion
   specifier, so they can be written verbatim. */
static const char *const usage_text[] =
   {
   "Usage: roccio [options]  [-a#][-b#] [-c#] [-f set_of_examples] [queryfile] resultfile\n",
   "   -a<real> : alpha (original weight)\n",
   "   -b<real> : beta  (relevant weights)\n",
   "   -c<real> : gama (irrelevant weights)\n",
   "   -f file  : numbers of relevant records\n",
   "Copyright Hans Paijmans 1995,1996\n",
   "version 1.0,  sept. 1996\n"
   };
size_t line;
size_t line_count=sizeof usage_text/sizeof usage_text[0];

for (line=0;line<line_count;line++)
   fputs(usage_text[line],stdout);

exit(1);
}


/* ---------------------------------------------------------------- */

/* ---------------------------------------------------------------- */
void get_arguments(int argc,char **argv)
{
char *s,o;
int x;
char sentnaam[40];
FILE *fuit;
extern int optind;
extern char *optarg;

num_examples=0;
alpha=beta=gama=1;

if ((argc<2) || (!strcmp(argv[1],"-h"))) help();

while ((o=getopt(argc,argv,"hdva:b:c:f:"))!=-1)
     switch (o)
       {
       case 'h': help();break;
       case 'd': debug=1; break;
       case 'v': verbose=1; break;
       case 'a': alpha=atof(optarg);break;
       case 'b': beta=atof(optarg);break;
       case 'c': gama=atof(optarg);break;

       case 'f': 
                 strcpy(sentnaam,optarg);
                 if ((fuit=fopen(sentnaam,"r"))==NULL) 
                          error_exit("no judgements-file: ",sentnaam);
                 while (!feof(fuit)) 
                          fscanf(fuit,"%d ",&examples[num_examples++]);
                 fclose(fuit);
                 if (debug)
                      for (x=0;x<num_examples;x++) printf("%d ",examples[x]);
                 break;
       }
/* --- */

dnaam[0]=0;

if (optind<argc) strcpy(qnaam,argv[optind++]);
if (optind<argc) strcpy(dnaam,argv[optind++]);

no_query_file=1;
last_score=1;

if (debug) 
    {
    check_all();
    printf("last_score         %d\n",last_score);
    printf("no_query_file      %d\n",no_query_file);
    }
}
/* ---------------------------------------------------------------- */
word_vector *read_vector(FILE *f)
{
char *concept;
char buffer[BUF_SIZE];
int x;
word_vector *vec;

do
 {
 fgets(buffer,99,f);
 if (feof(f)) return NULL;
 }
while ((buffer[0]==comment) || (strlen(buffer)<3));

vec=(word_vector*)malloc(sizeof(word_vector));
if (vec==NULL) error_exit("Memory problem","");

x=0;
concept=strtok(buffer," \t");
strcpy(vec->concept,concept);
while (concept)
  {
  concept=strtok(NULL," \t");
  if (concept) vec->weight[x++]=atof(concept);
  }
vector_length=x;
return (vec);
}
/* ---------------------------------------------------------------- */
main(int argc,char **argv)
{
int x,y,rel,irrel,score;
int doc_aantal;
FILE *dfile;
word_vector *query,*new_query,*vec,*doc_vec[10000];

initialize(argv[0]);
get_arguments(argc,argv);

/* read query from database */

if ((dfile=fopen(qnaam,"r"))==NULL) error_exit ("Q-file not found: ",qnaam);

if (no_query_file) query=read_vector(dfile);


if (dnaam[0]) /*if separate database */
   {
   fclose(dfile);
   if ((dfile=fopen(qnaam,"r"))==NULL) 
                      error_exit ("Datafile not found: ",dnaam);
   }

/* -------------------- read the documentvectors ---------------------- */
doc_aantal=1;
do
   {
   vec=read_vector(dfile);
   if (vec!=NULL) doc_vec[doc_aantal++]=vec;
   }
while (vec!=NULL);

if (debug) printf("num_examp  %d\ndoc_aantal %d\n",num_examples,doc_aantal);

/* ------------------- add the relevance judgements ------------------- */

if (num_examples)  /* als we een serie aparte judgements hebben */
   {               /* voegen we ze achter de records */
   for (y=1;y<doc_aantal;y++) doc_vec[y]->weight[vector_length]=0;
   for (y=0;y<num_examples;y++) doc_vec[examples[y]]->weight[vector_length]=1;
   } else vector_length--;

/* ------------------------------------------------------------------- */
/*       vector_length now always includes the relevance judgment      */

/* ------------------- debug information -------------------- */
if (verbose) 
  {

  for (y=1;y<doc_aantal;y++)
     {
     printf("%10s ",doc_vec[y]->concept);
     for (x=0;x<vector_length;x++) printf("%5.2f ",doc_vec[y]->weight[x]);
     if (doc_vec[y]->weight[vector_length]) printf("+"); else printf("-");
     printf("\n");
     }  
  printf ("----------\n");
  printf("%10s ",query->concept);
  for (x=0;x<vector_length;x++) printf("%5.2f ",query->weight[x]);
  printf("\n");

  }
/* ------------------------------------------------------------ */
new_query=(word_vector*)malloc(sizeof(word_vector));

for (y=0;y<vector_length;y++)
  {
  rel=irrel=0;
  sum_rel=sum_irrel=0;
  for (x=1;x<doc_aantal;x++)
        {
        if (doc_vec[x]->weight[vector_length])
	  {sum_rel+=doc_vec[x]->weight[y];rel++;}
	else {sum_irrel+=doc_vec[x]->weight[y];irrel++;}
        }
  sum_rel/=rel;
  sum_irrel/=irrel;
  if (debug) printf("%f - %f\n",sum_rel,sum_irrel);
  new_query->weight[y]=
           (alpha * query->weight[y]) + (beta * sum_rel) - (gama * sum_irrel);
  }

printf("new_query  ");
for (x=0;x<vector_length;x++) printf("%5.2f ",new_query->weight[x]);
printf("\n");
exit(0);
}
























