/*
 *   Copyright (C) 2000 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_MIFLUZ_H

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <unistd.h>
#include <new>
#include <mifluz.h>

#include <strlower.h>
#include <isomap.h>
#include <getopttools.h>
#include <salloc.h>
#include <file_exists.h>
#include <html_content.h>
#include <sqlutil.h>

#include <webbase_hook.h>

/*
 * Forward declarations of the file-local hook callbacks. Most of them
 * are installed in the webbase_hook_params_t table built by
 * hooksmifluz_init(); hook_content_collect, hook_handle_content,
 * hook_open and hook_close are internal helpers called from the others.
 */
static int hook_init(webbase_hook_params_t* params, webbase_t* base);
static int hook_ok(webbase_hook_params_t* params, webbase_url_t* webbase_url);
static int hook_insert(webbase_hook_params_t* params, webbase_url_t* webbase_url);
static int hook_content_collect(int info, html_content_parsed_t* parsed, void* data);
static int hook_handle_content(webbase_hook_params_t* params, const char* buffer, int buffer_length);
static int hook_insert_watch(webbase_hook_params_t* params);
static int hook_update(webbase_hook_params_t* params, webbase_url_t* webbase_url);
static int hook_prepare(webbase_hook_params_t* params, int id, char* url, int url_length);
static int hook_delete_id(webbase_hook_params_t* params, int id, char* url, int url_length);
static int hook_rebuild_init(webbase_hook_params_t* params, const char* where);
static int hook_rebuild_start(webbase_hook_params_t* params);
static int hook_rebuild_end(webbase_hook_params_t* params);
static int hook_info_set(webbase_hook_params_t* params, const char* info);
static int hook_server(webbase_hook_params_t* params, int server);
static int hook_end(webbase_hook_params_t* params);
static int hook_close(webbase_hook_params_t* params);
static int hook_open(webbase_hook_params_t* params, int flags);
static int hook_getopt(webbase_hook_params_t* params, int argc, char** argv);

/*
 * Shared lib entry point must be plain C, not C++ mangled.
 * hooksmifluz_init() is the only symbol of this file that is not
 * static; it returns the hook table as an opaque pointer.
 */
extern "C" {
  void* hooksmifluz_init();
}

/*
 * Field indices passed to WordKey::Set() / WordKey::Overflow(). The
 * order matches the default "wordlist_wordkey_description" set in
 * hook_getopt ("Word 24/Server 24/URL 32/Location 24").
 */
#define POS_Word     0
#define POS_Server   1
#define POS_URL      2
#define POS_Location 3 

/*
 * Values of webbase_hook_mifluz_params_t::action, telling
 * hook_handle_content whether parsed words are added or removed.
 */
#define ACTION_INSERT 1
#define ACTION_DELETE 2

/*
 * Private state of the mifluz hook, stored in webbase_hook_params_t::data.
 */
typedef struct webbase_hook_mifluz_param {
  /* mifluz context, created in hook_getopt from the configuration. */
  WordContext* context;
  /* Scratch word reference obtained from context->Word(); reused for every word. */
  WordReference* wordRef;
  /* Inverted index handle; created in hook_open, deleted in hook_close. */
  WordList* inv;
  /* Directory of the index, set to DEFAULT_INDEX_DIR in hook_getopt. */
  String index_dir;
  /* Index name passed to WordList::Open (-index_file or index_dir + "/index"). */
  String index_base;
  /* mifluz configuration file name (-index_config or a default). */
  String index_config;
  /* ACTION_INSERT or ACTION_DELETE, set before running the HTML parser. */
  int action;
  /* Copy of the previous content of a URL, saved by hook_prepare. */
  char* content;
  /* Number of valid bytes in content; 0 when nothing is pending. */
  int content_length;
  /* Size bookkeeping for content, maintained by static_alloc. */
  int content_size;
} webbase_hook_mifluz_params_t;

/*
 * Command line options understood by hook_getopt. The second field is
 * getopt's has_arg: verbose_hooks takes no argument, index_file and
 * index_config each require one. flag and val are 0, so
 * getopt_long_only returns 0 for all of them and the option is
 * identified by name.
 */
static struct option long_options[] =
{
    {"verbose_hooks", 0, 0, 0},
    {"index_file", 1, 0, 0},
    {"index_config", 1, 0, 0},
    {0, 0, 0, 0}
};

/*
 * Help text for the options in long_options. Compiled out (#if 0);
 * nothing in this file references it.
 */
#if 0
static struct option_help long_options_help[] =
{
  {"verbose_hooks", "increase verbosity level"},
  {"index_file <path>", "override mifluz index file name"},
  {"index_config <path>", "override mifluz configuration file name"},
  {"", ""}
};
#endif

/*
 * Remember the webbase handle in the hook parameters and open the
 * index in read/write mode. Returns the result of hook_open.
 */
static int hook_init(webbase_hook_params_t* params, webbase_t* base)
{
  params->base = base;

  const int status = hook_open(params, O_RDWR);
  return status;
}

/*
 * Tell whether a URL should be indexed: returns 1 when the URL has the
 * WEBBASE_URL_INFO_CONTENT flag, a positive content length and is not
 * reported by webbase_url_robots_p; returns 0 otherwise.
 */
static int hook_ok(webbase_hook_params_t* params, webbase_url_t* webbase_url)
{
  (void)params;

  if(!(webbase_url->w_info & WEBBASE_URL_INFO_CONTENT))
    return 0;
  if(webbase_url->w_content_length <= 0)
    return 0;
  if(webbase_url_robots_p(webbase_url))
    return 0;

  return 1;
}

/*
 * Index the content of a URL: reset the shared word reference, fill in
 * the URL and server fields of its key, then run the HTML parser with
 * hook_content_collect as callback and ACTION_INSERT as action.
 * Records the rowid as hookid on the URL and returns the result of
 * html_content_parse.
 */
static int hook_insert(webbase_hook_params_t* params, webbase_url_t* webbase_url)
{
  webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;

  if(params->verbose)
    fprintf(stderr, "indexing %s\n", webbase_url->w_url);

  /*
   * Insertion context: every word found below shares this key prefix.
   */
  WordReference* word_ref = mifluz->wordRef;
  WordKey& key = word_ref->Key();
  word_ref->Clear();
  key.Set(POS_URL, webbase_url->w_rowid);
  key.Set(POS_Server, params->server_id);
  key.Set(POS_Location, WORD_KEY_VALUE_INVALID);

  /*
   * Parser setup: the source is either the in-memory content or the
   * file holding it, depending on how webbase was built.
   */
  html_content_t walk;
  html_content_reset(&walk);
  walk.content_callback = hook_content_collect;
  walk.content_data = (void*)params;
#ifdef WEBBASE_CONTENT_BASE
  walk.parser.info = HTML_SOURCE_STRING;
  walk.parser.source = webbase_url->content;
  walk.parser.source_length = webbase_url->content_length;
#else /* WEBBASE_CONTENT_BASE */
  char* path = webbase_url_file(webbase_url->w_rowid);
  walk.parser.info = HTML_SOURCE_FILENAME;
  walk.parser.source = path;
  walk.parser.source_length = strlen(path);
#endif /* WEBBASE_CONTENT_BASE */

  mifluz->action = ACTION_INSERT;
  const int parse_status = html_content_parse(&walk);

  webbase_url->w_hookid = webbase_url->w_rowid;

  return parse_status;
}

/*
 * HTML parser callback. Selects the text to index for the parsed
 * element and hands it to hook_handle_content.
 *
 * For HTML_CONTENT_META the text is buffer1, and only when buffer0
 * starts with "keyword" or "description" (case insensitive); other
 * meta elements are skipped. For every other info value the text is
 * buffer0. Returns 1 when there is nothing to index, otherwise the
 * result of hook_handle_content.
 */
static int hook_content_collect(int info, html_content_parsed_t* parsed, void* data)
{
  webbase_hook_params_t* params = (webbase_hook_params_t*)data;

  unsigned char* text = parsed->buffer0;
  int text_length = parsed->buffer0_length;

  if(info == HTML_CONTENT_META) {
    const char* meta_name = (const char*)parsed->buffer0;
    const int wanted =
      !strncasecmp(meta_name, "keyword", 7) ||
      !strncasecmp(meta_name, "description", 11);

    if(wanted) {
      text = parsed->buffer1;
      text_length = parsed->buffer1_length;
    } else {
      text = 0;
      text_length = 0;
    }
  }

  if(!text)
    return 1;

  return hook_handle_content(params, (const char*)text, text_length);
}

/*
 * Split a chunk of text into words and insert them in, or delete them
 * from, the inverted index according to params_mifluz->action.
 *
 * The text is copied, passed through unaccent() and strlower(), then
 * scanned for runs of characters accepted by WordType::IsChar. For each
 * word the Location field of the shared key is incremented (unless
 * WordKey::Overflow reports it cannot be), and the word is either
 * written with WordList::Override (ACTION_INSERT) or looked up in the
 * dictionary and removed with WordList::Delete (ACTION_DELETE).
 *
 * params        hook parameters; params->data holds the mifluz state
 * buffer        text to index, buffer_length bytes, need not be
 *               null terminated
 * buffer_length number of bytes in buffer; values <= 0 mean no text
 *
 * Returns 1, or 0 if the working copy of the text cannot be allocated.
 */
static int hook_handle_content(webbase_hook_params_t* params, const char* buffer, int buffer_length)
{
  webbase_hook_mifluz_params_t* params_mifluz = (webbase_hook_mifluz_params_t*)params->data;
  const WordType& wtype = params_mifluz->context->GetType();
  WordKey& key = params_mifluz->wordRef->Key();
  key.Set(POS_Word, WORD_KEY_VALUE_INVALID);

  /* Nothing to scan; also keeps a negative length away from memcpy. */
  if(buffer_length <= 0)
    return 1;

  unsigned char* buffer_tmp = (unsigned char*)malloc(buffer_length + 1);
  if(!buffer_tmp) {
    fprintf(stderr, "hook_handle_content: cannot allocate %d bytes\n", buffer_length + 1);
    /* NOTE(review): 0 is assumed to signal failure to the parser callback chain - confirm. */
    return 0;
  }
  memcpy(buffer_tmp, buffer, buffer_length);
  buffer_tmp[buffer_length] = '\0';

  unaccent(buffer_tmp, buffer_length);
  strlower((char*)buffer_tmp, buffer_length);

  const unsigned char* p = buffer_tmp;
  const unsigned char* word_start = buffer_tmp;

  while(p - buffer_tmp < buffer_length) {
    /* Skip separators, then find the end of the word. */
    while(word_start - buffer_tmp < buffer_length && !wtype.IsChar(*word_start))
      word_start++;
    p = word_start;
    while(p - buffer_tmp < buffer_length && wtype.IsChar(*p))
      p++;

    if(word_start < p) {
      /* Bounded by buffer_length, hence fits in an int. */
      const int word_length = (int)(p - word_start);

      if(key.Overflow(POS_Location, 1)) {
	fprintf(stderr, "hook_handle_content: overflow location, check wordlist_wordkey_description definition\n");
      } else {
	key[POS_Location]++;
      }
      params_mifluz->wordRef->SetWord(String((char*)word_start, word_length));

      switch(params_mifluz->action) {
      case ACTION_INSERT:
	//
	// Check for overflow after it occured, hence the 0 argument of Overflow.
	//
	if(key.Overflow(POS_Word, 0)) {
	  fprintf(stderr, "hook_handle_content: overflow word identifier, check wordlist_wordkey_description definition\n");
	}
	if(params->verbose > 2) fprintf(stderr, "hook_handle_content: %s\n", (char*)params_mifluz->wordRef->Get());
	params_mifluz->inv->Override(*(params_mifluz->wordRef));
	break;

      case ACTION_DELETE:
	{
	  unsigned int wordid = 0;
	  if(params_mifluz->inv->Dict()->SerialExists(String((char*)word_start, word_length), wordid) != OK) {
	    // The precision of %.*s must be an int, not a ptrdiff_t.
	    fprintf(stderr, "hook_handle_content: cannot find id for %.*s\n", word_length, (char*)word_start);
	  } else {
	    params_mifluz->wordRef->Key().Set(WORD_KEY_WORD, wordid);

	    if(params->verbose > 2) fprintf(stderr, "hook_handle_content: %s\n", (char*)params_mifluz->wordRef->Get());
	    params_mifluz->inv->Delete(*(params_mifluz->wordRef));
	  }
	}
	// A failed lookup is already reported above; do not fall
	// through to the "unknown action" message.
	break;

      default:
	fprintf(stderr, "hook_handle_content: unknown action %d\n", params_mifluz->action);
	break;
      }
    }
    word_start = p;
  }

  free(buffer_tmp);
  return 1;
}

/*
 * insert_watch callback: nothing to do for this hook. Always returns 1.
 */
static int hook_insert_watch(webbase_hook_params_t* params)
{
  (void)params;
  return 1;
}

/*
 * Re-index a URL: remove the entries recorded under its hookid with
 * hook_delete_id, then index it again with hook_insert. The results of
 * both calls are ignored; always returns 1.
 */
static int hook_update(webbase_hook_params_t* params, webbase_url_t* webbase_url)
{
  char* url = webbase_url->w_url;

  if(params->verbose)
    fprintf(stderr, "reindex %s\n", url);

  int url_length = strlen(url);
  hook_delete_id(params, webbase_url->w_hookid, url, url_length);
  hook_insert(params, webbase_url);

  return 1;
}

/*
 * Save the currently stored content of URL `id` so that a later
 * hook_delete_id can unindex it. The content is read from the
 * url_content table (or the matching split table when TABLE_SPLIT is
 * defined) and copied, null terminated, into the hook private data.
 * Does nothing for id <= 0 or when no non-empty content is found.
 * Always returns 1.
 */
static int hook_prepare(webbase_hook_params_t* params, int id, char* url, int url_length)
{
  if(id <= 0) return 1;

  if(params->verbose > 1)
    fprintf(stderr, "prepare index update %.*s\n", url_length, url);

  char sql[256];
#ifdef TABLE_SPLIT
  sprintf(sql, "select content from url_content%02d where rowid = %d", (id % TABLE_SPLIT_SIZE), id);
#else /* TABLE_SPLIT */
  sprintf(sql, "select content from url_content where rowid = %d", id);
#endif /* TABLE_SPLIT */

  smysql_query(&params->base->mysql, sql);
  MYSQL_RES* result = smysql_store_result(&params->base->mysql);

  /* Both point into / describe the result set: use before freeing it. */
  char* stored = 0;
  int stored_length = 0;

  if(mysql_num_rows(result) > 0) {
    MYSQL_ROW row = mysql_fetch_row(result);
    if(row) {
      unsigned long* lengths = mysql_fetch_lengths(result);
      if(row[0]) {
	stored = row[0];
	stored_length = lengths[0];
      }
    }
  }

  if(stored_length > 0) {
    if(params->verbose > 4)
      fprintf(stderr, "prepare index memorize %.*s\n", stored_length, stored);

    webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;
    static_alloc(&mifluz->content, &mifluz->content_size, stored_length + 1);
    memcpy(mifluz->content, stored, stored_length);
    mifluz->content[stored_length] = '\0';
    mifluz->content_length = stored_length;
  }

  mysql_free_result(result);

  return 1;
}

/*
 * Remove from the index the words of the content saved by
 * hook_prepare. Returns 1 without doing anything when no content is
 * pending. Otherwise the saved content is parsed with
 * hook_content_collect as callback and ACTION_DELETE as action, the
 * pending length is reset to 0, and the result of html_content_parse is
 * returned.
 */
static int hook_delete_id(webbase_hook_params_t* params, int id, char* url, int url_length)
{
  webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;

  if(params->verbose)
    fprintf(stderr, "unindex %.*s\n", url_length, url);

  if(mifluz->content_length <= 0)
    return 1;

  /*
   * Deletion context: same key prefix as the one used at insertion.
   */
  WordReference* word_ref = mifluz->wordRef;
  WordKey& key = word_ref->Key();
  word_ref->Clear();
  key.Set(POS_URL, id);
  key.Set(POS_Server, params->server_id);
  key.Set(POS_Location, WORD_KEY_VALUE_INVALID);

  /*
   * Parse the saved content from memory.
   */
  html_content_t walk;
  html_content_reset(&walk);
  walk.content_callback = hook_content_collect;
  walk.content_data = (void*)params;
  walk.parser.info = HTML_SOURCE_STRING;
  walk.parser.source = mifluz->content;
  walk.parser.source_length = mifluz->content_length;

  /* The saved content is consumed by this call. */
  mifluz->content_length = 0;
  mifluz->action = ACTION_DELETE;

  return html_content_parse(&walk);
}

/*
 * rehook_start callback: nothing to do for this hook. Always returns 1.
 */
static int hook_rehook_start(webbase_hook_params_t* params)
{
  (void)params;
  return 1;
}

/*
 * rehook_end callback: nothing to do for this hook. Always returns 1.
 */
static int hook_rehook_end(webbase_hook_params_t* params)
{
  (void)params;
  return 1;
}

/*
 * First step of an index rebuild: reset the hookid column of the url
 * table to 0, restricted to the rows matching `where` when it is not
 * null. The query buffer is static and reused across calls.
 * Always returns 1.
 */
static int hook_rebuild_init(webbase_hook_params_t* params, const char* where)
{
  static char* query = 0;
  static int query_size = 0;

  const char* keyword = where ? "where" : "";
  const char* clause = where ? where : "";

  /* 128 bytes cover the fixed part of the statement. */
  static_alloc(&query, &query_size, strlen(clause) + 128);
  sprintf(query, "update url set hookid = 0 %s %s", keyword, clause);

  if(params->verbose)
    fprintf(stderr, "%s\n", query);

  smysql_query(&params->base->mysql, query);

  return 1;
}

/*
 * Second step of an index rebuild: close the index, reopen it with
 * O_TRUNC so that it starts empty, and switch the word list to batch
 * mode. Returns the hook_open result when it is not 0, else 1.
 */
static int hook_rebuild_start(webbase_hook_params_t* params)
{
  webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;

  hook_close(params);

  const int status = hook_open(params, O_RDWR|O_TRUNC);
  if(status != 0)
    return status;

  mifluz->inv->BatchStart();
  return 1;
}

/*
 * Last step of an index rebuild: leave batch mode and close the index.
 * Returns the result of hook_close.
 */
static int hook_rebuild_end(webbase_hook_params_t* params)
{
  webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;

  if(params->verbose > 1)
    fprintf(stderr, "rebuild_end\n");

  mifluz->inv->BatchEnd();

  const int status = hook_close(params);
  return status;
}

/*
 * info_set callback: the info string is ignored by this hook.
 * Always returns 1.
 */
static int hook_info_set(webbase_hook_params_t* params, const char* info)
{
  (void)params;
  (void)info;
  return 1;
}

/*
 * End of processing: close the index. Returns the result of hook_close.
 */
static int hook_end(webbase_hook_params_t* params)
{
  if(params->verbose > 1)
    fprintf(stderr, "end\n");

  const int status = hook_close(params);
  return status;
}

/*
 * Record the identifier of the server being processed; it is stored in
 * the Server field of the word keys by hook_insert and hook_delete_id.
 * Always returns 1.
 */
static int hook_server(webbase_hook_params_t* params, int server)
{
  const int server_id = server;
  params->server_id = server_id;
  return 1;
}

/*
 * Close the index and release the content buffer saved by hook_prepare.
 *
 * The buffer bookkeeping is reset to the empty state (null pointer,
 * zero size and length) after the free. hook_rebuild_start closes and
 * reopens the index, so without the reset a later hook_prepare would
 * hand a dangling pointer to static_alloc and the next hook_close
 * would free it a second time.
 *
 * Safe to call when the index is already closed. Always returns 1.
 */
static int hook_close(webbase_hook_params_t* params)
{
  if(params->verbose > 1) fprintf(stderr, "close\n");
  webbase_hook_mifluz_params_t* params_mifluz = (webbase_hook_mifluz_params_t*)params->data;

  /* free(0) is a no-op, so no guard is needed. */
  free(params_mifluz->content);
  params_mifluz->content = 0;
  params_mifluz->content_size = 0;
  params_mifluz->content_length = 0;

  if(params_mifluz->inv) {
    delete params_mifluz->inv;
    params_mifluz->inv = 0;
  }
  return 1;
}

/*
 * Open the inverted index if it is not already open.
 *
 * flags is passed as is to WordList::Open (O_RDWR, optionally with
 * O_TRUNC, in this file). Returns 0 when the index was already open,
 * otherwise the result of WordList::Open.
 */
static int hook_open(webbase_hook_params_t* params, int flags)
{
  webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;

  if(mifluz->inv)
    return 0;

  mifluz->inv = mifluz->context->List();
  return mifluz->inv->Open(mifluz->index_base, flags);
}

/*
 * Parse the command line options of the hook and build the mifluz
 * context.
 *
 * Recognized options (see long_options): -verbose_hooks increments
 * params->verbose, -index_file sets the index name, -index_config sets
 * the configuration file. Anything else is ignored.
 *
 * The index name defaults to DEFAULT_INDEX_DIR "/index". When no
 * configuration file is given, DEFAULT_CONFIG_FILE is used if the
 * MIFLUZ_CONFIG environment variable is unset and the file exists;
 * otherwise the name comes from WordContext::ConfigFile(). The file is
 * read on top of the builtin defaults when it exists.
 *
 * Always returns 1.
 */
static int hook_getopt(webbase_hook_params_t* params, int argc, char** argv)
{
  webbase_hook_mifluz_params_t* mifluz = (webbase_hook_mifluz_params_t*)params->data;

  /* Restart getopt scanning and keep it quiet about unknown options. */
  opterr = 0;
  optind = 0;

  for(;;) {
    /* getopt_long_only stores the index of the matched option here. */
    int option_index = 0;
    const int c = getopt_long_only(argc, argv, "-", long_options, &option_index);

    if(c == -1)
      break;

    /* Only long options are of interest; skip everything else. */
    if(c != 0)
      continue;
    /* Options that set a flag need no further handling. */
    if(long_options[option_index].flag != 0)
      continue;

    const char* name = long_options[option_index].name;
    if(!strcmp(name, "verbose_hooks")) {
      params->verbose++;
    } else if(!strcmp(name, "index_file")) {
      mifluz->index_base = optarg;
    } else if(!strcmp(name, "index_config")) {
      mifluz->index_config = optarg;
    }
  }

  static ConfigDefaults defaults[] = {
    { "wordlist_extend", "true", 0 },
    { "wordlist_minimum_word_length", "1", 0 },
    { "wordlist_maximum_word_length", "25", 0 },
    { "wordlist_cache_size", "10000000", 0 }, // 10MB
    { "wordlist_cache_max", "100000000", 0 }, // 100MB

    { "wordlist_page_size", "8192", 0 },     // 8KB
    { "wordlist_compress", "1", 0 },
    { "wordlist_wordkey_description","Word 24/Server 24/URL 32/Location 24"},
    { 0, 0, 0 }
  };

  mifluz->index_dir = DEFAULT_INDEX_DIR;
  if(mifluz->index_base.empty())
    mifluz->index_base = mifluz->index_dir + String("/index");

  Configuration config;
  config.Defaults(defaults);

  /* Pick a configuration file when none was given on the command line. */
  if(mifluz->index_config.empty()) {
    if(getenv("MIFLUZ_CONFIG") || !file_exists(DEFAULT_CONFIG_FILE))
      mifluz->index_config = WordContext::ConfigFile();
    else
      mifluz->index_config = DEFAULT_CONFIG_FILE;
  }

  if(mifluz->index_config.empty()) {
    if(params->verbose)
      fprintf(stderr, "hook_getopt: no -index_config parameter, using builtin defaults\n");
  } else if(file_exists((char*)mifluz->index_config)) {
    if(params->verbose > 1)
      fprintf(stderr, "hook_getopt: using mifluz configuration file %s\n", (char*)mifluz->index_config);
    config.Read(mifluz->index_config);
  } else {
    fprintf(stderr, "hook_getopt: -index_config %s file does not exist\n", (char*)mifluz->index_config);
  }

  mifluz->context = new WordContext(config);
  mifluz->wordRef = mifluz->context->Word();

  return 1;
}

void* hooksmifluz_init()
{
  webbase_hook_params_t* params = (webbase_hook_params_t*)smalloc(sizeof(webbase_hook_params_t));

  memset((char*)params, '\0', sizeof(webbase_hook_params_t));

  params->init = hook_init;
  params->ok = hook_ok;
  params->insert = hook_insert;
  params->insert_watch = hook_insert_watch;
  params->update = hook_update;
  params->prepare = hook_prepare;
  params->delete_id = hook_delete_id;
  params->rehook_start = hook_rehook_start;
  params->rehook_end = hook_rehook_end;
  params->rebuild_init = hook_rebuild_init;
  params->rebuild_start = hook_rebuild_start;
  params->rebuild_end = hook_rebuild_end;
  params->info_set = hook_info_set;
  params->server = hook_server;
  params->url2server = 0;
  params->tend = hook_end;
  params->getopt = hook_getopt;

  webbase_hook_mifluz_params_t* params_mifluz = (webbase_hook_mifluz_params_t*)smalloc(sizeof(webbase_hook_mifluz_params_t));

  memset((char*)params_mifluz, '\0', sizeof(webbase_hook_mifluz_params_t));

  params->name = "hooksmifluz";
  params->data = (void*)params_mifluz;

  return (void*)params;
}

#endif /* HAVE_MIFLUZ_H */
