/*

Description

A query is compared with a database. The tuples in the database are
sorted on similarity with a query. A number of tuples is selected
as 'good' answers on that query.
Using Rocchio's algorithm the weights in the query vector are adjusted
towards the selected document vectors.

Input:  query, database
Output: adjusted query

==========================================

Usage: learn [options] [-f set_of_examples] [queryfile] databasefile 

If no queryfile is given, the first vector in the
databasefile is considered to be the query.

Every tuple of course is a document vector.

The set_of_examples is a list of integers that point to 
those documents in the database that act as 'good' examples.

Alternatively, when no -f option is given, the last value in each
document vector acts as the indicator: non-zero marks a good example,
zero marks a bad one.

*/

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include "utils.h"

/* Size in bytes of the line buffer used by read_vector(). */
#define BUF_SIZE 64000
/* Number of float slots in the weight array of a word_vector. */
#define VECTOR_SIZE 32000

/* One labelled vector as read from a query or database file. */
typedef struct
   {
   char concept[WORD_LENGTH];    /* label: the first field of the input line */
   float weight[VECTOR_SIZE];    /* numeric fields of the line; main() also stores the
                                    relevance flag in the slot at index vector_length */
   }word_vector;

/* Scratch buffer; not referenced by any code visible in this file. */
char regel[1024];

/* Name of the query file (first positional argument). */
char qnaam[WORD_LENGTH];
/* Name of the database file (second positional argument); empty if absent. */
char dnaam[WORD_LENGTH];

/* Number of weights found by the most recent read_vector() call. */
int vector_length;

/* How many entries of examples[] are in use. */
int num_examples;
/* Document indices read from the -f file, marking the 'good' examples. */
int examples[100];

/* Per-dimension accumulators used by main() for the relevant and
   non-relevant documents. */
float sum_rel;
float sum_irrel;

/* Coefficients of the update formula in main():
   alpha * query + beta * relevant - gamma * irrelevant.
   NOTE(review): some <math.h> implementations declare a function named
   gamma(); confirm this global does not clash on the target toolchain. */
float alpha;
float beta;
float gamma;


/* ---------------------------------------------------------------- */
/* Write the usage line and the copyright notice to stdout, then end the
   program with exit status 1.  This function does not return. */
void help()
{
    fputs("Usage: ...\n", stdout);
    fputs("Copyright Hans Paijmans 1995,1996", stdout);

    exit(1);
}


/* ---------------------------------------------------------------- */

/* ---------------------------------------------------------------- */
void get_arguments(int argc,char **argv)
{
char *s,o;
char sentnaam[40];
FILE *fuit;
extern int optind;
extern char *optarg;

num_examples=0;
alpha=beta=gamma=1;

if ((argc<2) || (!strcmp(argv[1],"-h"))) help();

while ((o=getopt(argc,argv,"hdvf:"))!=-1)
     switch (o)
       {
       case 'h': help();break;
       case 'd': debug=1; break;
       case 'v': verbose=1; break;
       case 'f': 
                 strcpy(sentnaam,optarg);
                 if ((fuit=fopen(sentnaam,"r"))==NULL) 
                                error_exit("no out-file %s\n",sentnaam);
                 while (!feof(fuit)) fscanf(fuit,"%d ",&examples[num_examples++]);
                 fclose(fuit);
                 break;
       }
/* --- */

dnaam[0]=0;
if (optind<argc) strcpy(qnaam,argv[optind++]);
if (optind<argc) strcpy(dnaam,argv[optind++]);


if (debug) check_all();

}


/* ---------------------------------------------------------------- */

/* ---------------------------------------------------------------- */
word_vector *read_vector(FILE *f)
{
char *concept;
char buffer[BUF_SIZE];
int x;
word_vector *vec;


if (fgets(buffer,99,f)==NULL) return(NULL);

vec=(word_vector*)malloc(sizeof(word_vector));
if (vec==NULL) error_exit("Memory problem");

x=0;
concept=strtok(buffer," \t");
strcpy(vec->concept,concept);
while (concept)
  {
  concept=strtok(NULL," \t");
  if (concept) vec->weight[x++]=atof(concept);
  }
vector_length=x;
return (vec);
}
/* ---------------------------------------------------------------- */
main(int argc,char **argv)
{
int x,y,rel,irrel;
int doc_aantal;
FILE *dfile;
word_vector *query,*new_query,*vec,*doc_vec[10000];

initialize();
get_arguments(argc,argv);

/* read query from database */

if ((dfile=fopen(qnaam,"r"))==NULL) error_exit ("Q-file %s not found",qnaam);
query=read_vector(dfile);

if (verbose)
  {
  printf("%s ",query->concept);
  for (x=0;x<vector_length;x++) printf("%f ",query->weight[x]);
  printf("\n");
  printf ("-------------------------------------\n");
  }

if (dnaam[0]) /*if separate database */
   {
   fclose(dfile);
   if ((dfile=fopen(qnaam,"r"))==NULL) error_exit ("D-file %s not found",dnaam);
   }

/* -------------------- read the documentvectors ---------------------- */
doc_aantal=0;
do
   {
   vec=read_vector(dfile);
   if (vec!=NULL)
     {
       /*
     doc_vec[doc_aantal]=(word_vector*)malloc(sizeof(word_vector));
     if (doc_vec[doc_aantal]==NULL) error_exit("Memory problem");
     */
     doc_vec[doc_aantal]=vec;
     doc_aantal++;
     }
   }
while (vec!=NULL);

/* ------------------- add the relevance judgements ------------------- */

if (num_examples)
   {
   for (y=0;y<doc_aantal;y++)
     {
     doc_vec[y]->weight[vector_length]=0;
     }
   for (y=0;y<num_examples;y++) doc_vec[examples[y]]->weight[vector_length]=1;
   } else vector_length--;

/* ------------------- debug information -------------------- */
if (verbose) 
   for (y=0;y<doc_aantal;y++)
     {
     printf("%s ",doc_vec[y]->concept);
     for (x=0;x<vector_length+1;x++) printf("%f ",doc_vec[y]->weight[x]);
     printf("\n");
     }

/* ------------------------------------------------------------ */
new_query=(word_vector*)malloc(sizeof(word_vector));

for (y=0;y<vector_length;y++)
  {
  rel=irrel=0;
  sum_rel=sum_irrel=0;
  for (x=0;x<doc_aantal;x++)
        {
        if (doc_vec[x]->weight[vector_length])
	  {sum_rel+=doc_vec[x]->weight[y];rel++;}
	else {sum_irrel+=doc_vec[x]->weight[y];irrel++;}
        }
  sum_rel/=rel;
  sum_irrel/=irrel;
  printf("%f - %f\n",sum_rel,sum_irrel);
  new_query->weight[y]=
           (alpha * query->weight[y]) + (beta * sum_rel) - (gamma * sum_irrel);
  }

for (x=0;x<vector_length;x++) printf("%f ",new_query->weight[x]);

exit(0);
}












