/*********************************************************************/
/*        LINEAR TREE for Supervised Learning                        */
/*        Versao 1.0 (10/12/1997)                                    */
/*        Developed by: Joao Gama                                    */
/*                LIACC - Uni.do Porto                               */
/*                jgama@ncc.up.pt                                    */
/*-------------------------------------------------------------------*/
/*   File: Entropia.c                                                */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "Ci_instances.h"
#include "entropia.h"
#include "externs.i"

#define LOG2    0.6931471806
#define EPSILON 0.0000001
#define MYLOG(x) ((x) == 0.0 ? 0.0 : log(x) / LOG2)

static double Hy = 0.0;
/****************************************/
/*   Prototyps for Private FUNCTIONS    */
/****************************************/
static EntAtt *MakeEntAtt();
static void InitEntAtt(EntAtt *entatt,  int k);
static double entropia_atributo_discreto(CiDs *ds, double ***STATIS, int k, double nr_exs, double *Hx);
static double entropia_atributo_continuo(CiDs *ds, double *class_freq, long int Low, long int High, int k, double *Hx, double *split);

static void best_attribute(EntAtt *Best, int k, double Gain, double GainRatio, double split, int Flag);
static void set_best_attribute(EntAtt *Best, int k, double Gain, double GainRatio, double split);
/****************************************/
/*   Public FUNCTIONS                   */
/****************************************/
EntAtt *entropia\
(CiDs *ds, double ***STATIS, double *class_freq, int nr_att, long int Low, long int High, double *Mean)
{
  register int i, Att, nr = 0;
  double p, Hx, H, split, Gain_Mean = 0.0;
  EntAtt *Best = MakeEntAtt(); 
 
  Hy = 0.0;
  for(i = 1; i <= Ci_NrClasses(CI_Domain(ds)); i++) 
    if (class_freq[i]) {
      p = (double) class_freq[i] / class_freq[0];
      Hy -= p * MYLOG(p);
    }
  
  for(Att = 1; Att <= nr_att; Att++) {
    H = Hx = split = 0.0;
    switch(CiTypeAttr(ds->domain, Att)) {
    case integer:
    case ordered:
    case nominal:
      if ((H = entropia_atributo_discreto(ds, STATIS, Att, class_freq[0], &Hx)) != -1) {
	best_attribute(Best, Att, H, Hx, 0.0, GAIN_RATIO);
	Gain_Mean = (Gain_Mean * nr + H) / ++nr;
      }
      break;
    case continuous:
      CiQuickSort(ds, Att, Low, High);
      if ((H = entropia_atributo_continuo(ds, class_freq, Low, High, Att, &Hx, &split)) != -1) {
	best_attribute(Best, Att, H, Hx, split, GAIN_RATIO);
	Gain_Mean = (Gain_Mean * nr + H) / ++nr;
      }
    }
  }
  *Mean = Gain_Mean;
  return Best;
}
/****************************************/
/*   Private FUNCTIONS                  */
/****************************************/
static void InitEntAtt(EntAtt *entatt,  int k)
{
  if (entatt) {
    entatt->attribute = k;
    entatt->cut_point = 0.0;
    entatt->gain = 0.0;
    entatt->info = 1.0;
  }
}
static EntAtt *MakeEntAtt()
{
  EntAtt *entatt = (EntAtt *) malloc(sizeof(EntAtt));
  if (entatt) {
    entatt->attribute = -1;
    entatt->cut_point = 0.0;
    entatt->gain = 0.0;
    entatt->info = 1.0;
  }
  return entatt;
}

static void best_attribute\
(EntAtt *Best, int k, double Gain, double Info, double split, int Flag_GAIN)
{
  if (Best->attribute == -1) 
    set_best_attribute(Best, k, Gain, Info, split);
  else 
    if (Flag_GAIN) {
      if (Gain >= 0.0 || Info > EPSILON)
	if (Gain / Info > Best->gain / Best->info) 
	  set_best_attribute(Best, k, Gain, Info, split);
    }
    else {
      if (Gain > Best->gain)
	set_best_attribute(Best, k, Gain, Info, split);
    }
}

static void set_best_attribute\
(EntAtt *Best, int k, double Gain, double Info, double split)
{
  Best->attribute = k;
  Best->cut_point = split;
  Best->gain = Gain;
  Best->info = Info ? Info : EPSILON;
}

static double entropia_atributo_discreto\
(CiDs *ds, double ***STATIS, int Att, double nr_exs, double *Hx)
{
  register int i, j, nr_splits = 0;
  double p, H = 0.0;

  *Hx = 0.0;
  for(i = 1; i <= NValsAttr(ds->domain, Att); i++) {
    if(STATIS[Att][0][i]) {
      p = (double) STATIS[Att][0][i] / ((double) nr_exs - STATIS[Att][0][0]);
      *Hx -= p * MYLOG(p);
      ++nr_splits;
    }
    
    for(j = 1; j <= Ci_NrClasses(ds->domain); j++)
      if(STATIS[Att][j][i]) {
        p = (double) STATIS[Att][j][i] / (double) STATIS[Att][0][i];
        H -= (STATIS[Att][0][i] / ((double) nr_exs - STATIS[Att][0][0])) * p * MYLOG(p);
      }
  }
  p = Hy - H;
  
  /*    Correction due to unknown values  */
  if (STATIS[Att][0][0]) {
    p = (double) (nr_exs - STATIS[Att][0][0]) / (double) nr_exs;
    p = Hy - H + p * MYLOG(p);
  }
  if (nr_splits > 1) return p;
  return -1.0;
}

static double entropia_atributo_continuo\
(CiDs *ds, double *class_freq, long int Low, long int High, int Att, double *Hx, double *split)
{
  register int i, j, z, classe, change_class, nr_unk = 0, nr_splits = 0;
  double minsplit;
  double p, p1, H, split_value, mean = 0.0;
  double inf = 0.0;
  EntAtt entatt;
  double nr_exs = class_freq[0], COUNTER_CLASS[Ci_NrClasses(ds->domain) + 1];

  for(z = 0; z <= Ci_NrClasses(ds->domain); z++) COUNTER_CLASS[z] = 0;
  InitEntAtt(&entatt, Att);

  if (nr_exs <= 2 * MINIMUM_EXAMPLES)
    return -1;

  i = Low;
  minsplit = 0.10 * class_freq[0] / (Ci_NrClasses(ds->domain) +1);
  if (minsplit < MINIMUM_EXAMPLES) minsplit = MINIMUM_EXAMPLES;
  if (minsplit > 25.0) minsplit = 25.0;
    
  /* Skip Unknowns */
  while (i < High && TypeOfVal(Ci_AttVal(ds, i)[Att]) != normal) {
    nr_unk += Ci_Weight(Ci_Example(ds, i));
    ++i;
  }

  ++i;
    
  for(; i < High; i++) {
    --i;
    inf = 0.0;
    change_class = FALSE;
     
    classe = Ci_Classe(Ci_Example(ds, i));
    COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i));
    COUNTER_CLASS[classe] += Ci_Weight(Ci_Example(ds, i));
    ++i;

    /* Find Boundaries Cut Points */
    while (i < High && COUNTER_CLASS[0] < minsplit) {
      COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i));
      COUNTER_CLASS[Ci_Classe(Ci_Example(ds, i))] += Ci_Weight(Ci_Example(ds, i));
      ++i;
      nr_splits = 1;
    }
    split_value = (CValAttEx(Ci_AttVal(ds, i),Att) + CValAttEx(Ci_AttVal(ds, i-1),Att)) / 2.0;

    while (i < High && split_value >= CValAttEx(Ci_AttVal(ds, i),Att)) {
      COUNTER_CLASS[0] += Ci_Weight(Ci_Example(ds, i));
      if (classe != Ci_Classe(Ci_Example(ds, i))) 
	change_class = TRUE;
      COUNTER_CLASS[Ci_Classe(Ci_Example(ds, i))] += Ci_Weight(Ci_Example(ds, i));
      ++i;
    }

    ++nr_splits; 

    split_value = (CValAttEx(Ci_AttVal(ds, i-1),Att) + CValAttEx(Ci_AttVal(ds, i),Att) ) / 2.0;
    classe = Ci_Classe(Ci_Example(ds, i));

    if (COUNTER_CLASS[0] >= minsplit && nr_exs - COUNTER_CLASS[0] >= minsplit) {
      p = (double) COUNTER_CLASS[0] / (double) nr_exs; 
      p1 = (double)(nr_exs - COUNTER_CLASS[0]) / (double) nr_exs;
      inf -=  p * MYLOG(p) + p1 * MYLOG(p1);
      
      H = 0.0;
      for(j = 1; j <= Ci_NrClasses(ds->domain); j++) {
	if (COUNTER_CLASS[j]) {
	  p = (double) COUNTER_CLASS[j] / (double) COUNTER_CLASS[0];
	  H -= (COUNTER_CLASS[0] / (double) nr_exs) * p * MYLOG(p);
	}
	if (class_freq[j] - COUNTER_CLASS[j]){
	  p1 = (double) (class_freq[j] - COUNTER_CLASS[j]) / (double) (nr_exs - COUNTER_CLASS[0]);
	  H -= ((nr_exs - COUNTER_CLASS[0]) / (double) nr_exs) * p1 * MYLOG(p1);
	}
      }
      
      mean += (Hy - H);
      best_attribute(&entatt, Att, Hy-H, inf, split_value, FALSE);
    }
    else
      --nr_splits;
  }

  *split = entatt.cut_point;
  *Hx = entatt.info;
  mean = (nr_splits ? mean / nr_splits : 1E-6);
  if (nr_unk) {    
    p = (double) (nr_exs - nr_unk)  / (double) nr_exs;
    entatt.gain += p * MYLOG(p);
  }

  if (nr_splits <= 0 || 
      (entatt.gain  <= MYLOG((double) nr_splits) / (double) nr_exs - 1E-3) ||  mean < 1E-5) {
    return -1.0;
  }
  return(entatt.gain - MYLOG((double) nr_splits) / nr_exs);
}






