/*
 *   Copyright (C) 1997, 1998, 1999, 2000
 *   	Free Software Foundation, Inc.
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <errno.h>

#include <salloc.h>
#include <logfile.h>

#include <getopttools.h>
#include <crawl.h>
#include <webbase.h>
#include <webbase_create.h>
#include <sqlutil.h>

/* init() sizes its option table and its help table as MAX_OPTIONS + 1 entries. */
#define MAX_OPTIONS 200
/*
 * Stored in the `val' field of the last initialized entry of the option
 * table in init().  NOTE(review): presumably a marker interpreted by
 * getopt_merge() -- confirm against getopttools.
 */
#define APPLICATION_OPTIONS		0x8000000

/*
 * Command line state of the crawler program. A single static instance
 * (params) is filled by init() and read by main(), crawlers() and finish().
 * String members are strdup'ed copies of option arguments.
 */
typedef struct crawler_params {
  /* Crawl context returned by crawl_alloc(); left unset when -schema is given. */
  crawl_params_t* crawl;
  /* -home_pages: main() calls hp_load_1() with where_start. */
  int home_pages;
  /* -externals: main() calls hp_print_externals() with where_start. */
  int externals;
  /* -unload (also set by -unload_keep_start): hp_unload() each URL argument. */
  int unload;
  /* -schema: main() calls webbase_schema() and exits. */
  int schema;
  /* -create: main() calls webbase_create() with the remaining arguments. */
  int create;
  /* -unload_keep_start: forwarded as last argument of hp_unload(). */
  int unload_keep_start;
  /* -rehook: main() calls crawl_rehook(). */
  int rehook;
  /* 1 for -rebuild, 2 for -rebuild_resume; forwarded to crawl_rebuild(). */
  int rebuild;
  /* -where_touch argument; when set main() calls crawl_urls() with it. */
  char* where_touch;
  /* -touch: crawl_touch() each URL argument. */
  int touch;
  /* -remove_unloaded: hp_remove_unloaded() each URL argument. */
  int remove_unloaded;
  /* -where_start argument: where clause applied to the start table. */
  char* where_start;
  /* -where_url argument; forwarded to crawl_rebuild(). */
  char* where_url;
  /* -log argument; passed to logfile(), prefix of child log names in crawlers(). */
  char* log;
  /* -show: hp_show() each URL argument with show_fields and show_where. */
  int show;
  /* -show_fields argument; forwarded to hp_show(). */
  char* show_fields;
  /* -show_where argument; forwarded to hp_show(). */
  char* show_where;
  /* -show_indexable: hp_show_indexable() each URL argument. */
  int show_indexable;
  /* -crawlers: main() calls crawlers() to run child crawler processes. */
  int crawlers;
  /* -crawlers_chunk argument (atoi); crawlers() substitutes 50 when it is 0. */
  int crawlers_chunk;
  /* -crawlers_nproc argument (atoi); crawlers() substitutes 5 when it is 0. */
  int crawlers_nproc;
} crawler_params_t;

/* Parsed command line; zero-initialized because it has static storage. */
static crawler_params_t params;

/* Set to 1 by the -verbose option; enables progress messages on stderr. */
static int verbose = 0;

/* Parse the command line into params and allocate the crawl context. */
static void init(int argc, char** argv);
/* Release what init() allocated and terminate the process with exit(0). */
static void finish();
/* Implementation of -crawlers: run child crawlers over chunks of the start table. */
static void crawlers(int argc, char** argv);

/*
 * Program entry point.
 *
 * Parses the command line with init(), then runs exactly one action, chosen
 * in this order of precedence: -schema, -home_pages, -crawlers, -create,
 * -rehook, -rebuild, -where_touch, -externals.  When none of these is given,
 * every remaining argument is canonicalized as a URL and handed to the
 * per-URL action selected by -remove_unloaded, -touch, -unload,
 * -show_indexable or -show (default: hp_load_in_core()).
 *
 * finish() terminates the process with exit(0); the trailing return is
 * only there to satisfy the compiler.
 */
int main(int argc, char** argv)
{
  crawl_params_t* crawl;

  init(argc, argv);

  /* -schema needs no crawl context: init() did not allocate one. */
  if(params.schema) {
    webbase_schema();
    exit(0);
  }

  crawl = params.crawl;

  if(params.home_pages) {
    hp_load_1(crawl, params.where_start);
  } else if(params.crawlers) {
    crawlers(argc, argv);
  } else if(params.create) {
    /* Remaining arguments, if any, are passed as the schema list. */
    const char** schema = 0;
    int schema_length = 0;
    if(optind < argc) {
      schema = (const char**)(argv + optind);
      schema_length = argc - optind;
    }
    webbase_create(&crawl->base->mysql, schema, schema_length);
  } else if(params.rehook) {
    crawl_rehook(crawl);
  } else if(params.rebuild) {
    crawl_rebuild(crawl, params.rebuild, params.where_url);
  } else if(params.where_touch) {
    crawl_urls(crawl, params.where_touch);
  } else if(params.externals) {
    hp_print_externals(crawl, params.where_start);
  } else {
    int i;
    /* Scratch URL object, re-used for each command line argument. */
    uri_t* url_object = uri_alloc("http://fake.net/", 16);
    for(i = optind; i < argc; i++) {
      char* url;
      if(uri_realloc(url_object, argv[i], strlen(argv[i])) != URI_CANNONICAL) {
        fprintf(stderr, "crawler: cannnot cannonicalize %s, ignored\n", argv[i]);
        /*
         * Really ignore the argument, as the message says: url_object
         * does not hold a usable form of argv[i] at this point.
         */
        continue;
      }
      url = uri_uri(url_object);
      if(params.remove_unloaded)
        hp_remove_unloaded(crawl, url);
      else if(params.touch)
        crawl_touch(crawl, url);
      else if(params.unload)
        hp_unload(crawl, url, params.unload_keep_start);
      else if(params.show_indexable)
        hp_show_indexable(crawl, url);
      else if(params.show)
        hp_show(crawl, url, params.show_fields, params.show_where);
      else
        hp_load_in_core(crawl, url);
    }
  }
  finish();
  return 0;
}

/*
 * Release the crawl context and every string duplicated by init(), then
 * terminate the process with exit status 0.  Never returns.
 */
void finish()
{
  crawl_free(params.crawl);
  /* free(NULL) is a no-op, so options that were not given need no guard. */
  free(params.log);
  free(params.where_start);
  free(params.where_url);
  free(params.where_touch);
  free(params.show_fields);
  free(params.show_where);
  exit(0);
}

/*
 * Declaration, with C linkage, of a function defined outside this file.
 * Nothing in this file calls it.
 * NOTE(review): presumably kept so that the hooks implementation is
 * referenced at link time -- confirm before removing.
 */
extern "C" {
void* hooksmifluz_init();
}

/*
 * Parse the command line into the file-scope `params' structure.
 *
 * The option table and help table declared here are first merged with the
 * ones provided by crawl_options() / crawl_help_options().  Options that
 * carry a flag pointer are stored by getopt itself; the others are handled
 * by name below.  -help prints the merged help and exits.
 *
 * Unless -schema was given, the crawl context is then allocated with
 * crawl_alloc(), which receives the same argc/argv and the merged table.
 * If -log was given the log file is opened with logfile().
 */
static void init(int argc, char** argv)
{
  static struct option long_options[MAX_OPTIONS + 1] =
  {
    /* These options set a flag. */
    {"verbose", 0, &verbose, 1},
    {"log", 1, 0, 0},
    {"help", 0, 0, 0},
    {"where_start", 1, 0, 0},
    {"where_url",1, 0, 0},
    {"where_touch", 1, 0, 0},
    {"unload", 0, 0, 0},
    {"unload_keep_start", 0, 0, 0},
    {"create", 0, &params.create, 1},
    {"schema", 0, &params.schema, 1},
    {"rehook", 0, &params.rehook, 1},
    {"rebuild", 0, &params.rebuild, 1},
    {"rebuild_resume", 0, &params.rebuild, 2},
    {"externals", 0, &params.externals, 1},
    {"home_pages", 0, &params.home_pages, 1},
    {"remove_unloaded", 0, &params.remove_unloaded, 1},
    {"touch", 0, &params.touch, 1},
    {"show", 0, &params.show, 1},
    {"show_fields", 1, 0, 0},
    {"show_where", 1, 0, 0},
    {"show_indexable", 0, &params.show_indexable, 1},
    {"crawlers", 0, &params.crawlers, 1},
    {"crawlers_chunk", 1, 0, 0},
    {"crawlers_nproc", 1, 0, 0},
    {0, MAX_OPTIONS, 0, APPLICATION_OPTIONS}
  };

  /*
   * Decimal form of MAX_OPTIONS, referenced by the last entry of the help
   * table.  It must be static: the help table is static and keeps the
   * pointer, which would dangle after init() returns if buf lived on the
   * stack.
   */
  static char buf[8];
  snprintf(buf, sizeof(buf), "%d", MAX_OPTIONS);

  static struct option_help long_help_options[MAX_OPTIONS + 1] =
  {
    /* One entry per option above, in the same order. */
    {"verbose", "main program debug messages."},
    {"log", "write log output in log file <file>."},
    {"help", "display this message."},
    {"where_start <where clause>", "only consider those Home Pages that match the <where clause> restriction."},
    {"where_url <where clause>", "associated with -rebuild option, only rebuild URLs that match the <where clause> restriction."},
    {"where_touch <where clause>", "crawl URLs matching the argument where clause in the url table"},
    {"unload", "remove the starting point and all the URLs linked to it."},
    {"unload_keep_start", "same as -unload except that starting point is left in the start table."},
    {"create", "create all the tables"},
    {"schema", "output the schema of the database"},
    {"rehook", "check and fix the concordance between the meta information database and the fulltext index."},
    {"rebuild", "remove all the records from the full text database and resubmit all the URLs for indexing."},
    {"rebuild_resume", "resume from an interrupted -rebuild."},
    {"externals", "print HREFs contained in the document located at the URL <url>"},
    {"home_pages", "load all the URLs listed in the start table."},
    {"remove_unloaded", "remove from database all URLs crawled starting from starting point <url>."},
    {"touch", "crawl URLs given in argument."},
    {"show", "show fields of start, url and url_complete matching url argument."},
    {"show_fields", "show specified fields of start, url and url_complete."},
    {"show_where", "only select records matching the constraint."},
    {"show_indexable", "display the HTML stripped document content"},
    {"crawlers", "run multiple crawlers"},
    {"crawlers_chunk <number>", "crawl <number> start entries in each crawler"},
    {"crawlers_nproc <number>", "run <number> simultaneous crawlers"},
    {"0", buf}
  };

  getopt_merge(long_options, crawl_options(long_options));
  getopt_help_merge(long_help_options, crawl_help_options(long_help_options));

  /* Stay silent on anything this table does not handle. */
  opterr = 0;
  optind = 0;
  for(;;) {
    /* `getopt_long' stores the option index here. */
    int option_index = 0;
    const char* name;
    int c = getopt_long_only(argc, argv, "-", long_options, &option_index);

    /* Detect the end of the options. */
    if(c == -1)
      break;

    /* Only long options returning 0 are handled here; the rest is skipped. */
    if(c != 0)
      continue;

    /* If this option set a flag, do nothing else now. */
    if(long_options[option_index].flag != 0)
      continue;

    name = long_options[option_index].name;
    if(!strcmp(name, "log")) {
      params.log = strdup(optarg);
    } else if(!strcmp(name, "help")) {
      getopt_dump(long_help_options);
      exit(0);
    } else if(!strcmp(name, "unload")) {
      params.unload = 1;
    } else if(!strcmp(name, "unload_keep_start")) {
      /* -unload_keep_start implies -unload. */
      params.unload_keep_start = 1;
      params.unload = 1;
    } else if(!strcmp(name, "where_start")) {
      params.where_start = strdup(optarg);
    } else if(!strcmp(name, "where_url")) {
      params.where_url = strdup(optarg);
    } else if(!strcmp(name, "where_touch")) {
      params.where_touch = strdup(optarg);
    } else if(!strcmp(name, "show_fields")) {
      params.show_fields = strdup(optarg);
    } else if(!strcmp(name, "show_where")) {
      params.show_where = strdup(optarg);
    } else if(!strcmp(name, "crawlers_chunk")) {
      params.crawlers_chunk = atoi(optarg);
    } else if(!strcmp(name, "crawlers_nproc")) {
      params.crawlers_nproc = atoi(optarg);
    }
  }

  /* -schema only prints the schema and needs no crawl context. */
  if(!params.schema) {
    params.crawl = crawl_alloc(argc, argv, long_options);
  }

  if(params.log) logfile(params.log);
}

/*
 * Return 1 if command line argument `arg' spells the long option `name',
 * introduced by one or two dashes and optionally followed by `=value'.
 * *inline_value is set to 1 when the `=value' form was used, 0 otherwise.
 */
static int crawlers_arg_is(const char* arg, const char* name, int* inline_value)
{
  size_t length = strlen(name);

  *inline_value = 0;
  if(arg[0] != '-') return 0;
  arg++;
  if(arg[0] == '-') arg++;
  if(strncmp(arg, name, length) != 0) return 0;
  if(arg[length] == '=') {
    *inline_value = 1;
    return 1;
  }
  return arg[length] == '\0';
}

/*
 * Return the index of a free entry (value 0) in child_pids[0..nproc-1].
 * When every entry is in use, block in wait() until one of the recorded
 * children exits and release its entry.  Exits the process if wait()
 * fails or reports a child that is not in the table.
 */
static int crawlers_acquire_slot(pid_t* child_pids, int nproc)
{
  for(;;) {
    int slot;
    int child_status;
    pid_t child;

    /* Get first free slot. */
    for(slot = 0; slot < nproc; slot++) {
      if(child_pids[slot] == 0) return slot;
    }

    /* No free slot, wait for a child to die. */
    child = wait(&child_status);
    if(child == 0) {
      fprintf(stderr, "crawlers: unexpected 0 return from wait\n");
      exit(1);
    }
    if(child == (pid_t)-1) {
      /* Single perror call so that errno cannot be clobbered in between. */
      perror("crawlers: wait");
      exit(-1);
    }
    if(verbose) fprintf(stderr, "die pid=%08d status=0x%08x\n", (int)child, child_status);

    for(slot = 0; slot < nproc; slot++) {
      if(child_pids[slot] == child) {
        child_pids[slot] = 0;
        break;
      }
    }
    if(slot >= nproc) {
      fprintf(stderr, "crawlers: unexpected child %d\n", (int)child);
      exit(1);
    }
  }
}

/*
 * Implementation of the -crawlers option.
 *
 * Reads the rowid of every entry of the `start' table matching
 * params.where_start, groups them in chunks of params.crawlers_chunk
 * entries (default 50) and, for each chunk, fork/execs argv[0] with
 * -home_pages and a -where_start clause restricted to the rowid range of
 * the chunk.  At most params.crawlers_nproc (default 5) children run at
 * the same time.  Each child logs to <log><slot>, where <log> is the -log
 * argument (default "crawler.log") and <slot> the 4 digit slot number.
 * Returns after all children have exited.
 *
 * The arguments of the parent are forwarded to the children, except
 * -crawlers, -crawlers_chunk, -crawlers_nproc, -log and -where_start.
 */
static void crawlers(int argc, char** argv)
{
  /* Options of the parent that take a value and must not reach the children. */
  static const char* const valued_options[] = {
    "crawlers_chunk", "crawlers_nproc", "log", "where_start"
  };
  int argc_child = 0;
  char** argv_child = (char**)smalloc(sizeof(char*) * (argc + 16));
  int i;

  /* Zero means "not given"; negative values are meaningless, use defaults too. */
  if(params.crawlers_chunk <= 0)
    params.crawlers_chunk = 50;
  if(params.crawlers_nproc <= 0)
    params.crawlers_nproc = 5;

  /*
   * Pre-compute the children argv list from our list.  Both the -opt and
   * --opt spellings are filtered, as well as opt=value: letting any form
   * of -crawlers through would make each child spawn crawlers in turn.
   */
  argv_child[argc_child++] = strdup("crawler[child]");
  for(i = 1; i < argc; i++) {
    int inline_value = 0;
    int skip = crawlers_arg_is(argv[i], "crawlers", &inline_value);
    size_t k;
    for(k = 0; !skip && k < sizeof(valued_options) / sizeof(valued_options[0]); k++) {
      if(crawlers_arg_is(argv[i], valued_options[k], &inline_value)) {
        skip = 1;
        /* The value is the next argument unless it was given as opt=value. */
        if(!inline_value) i++;
      }
    }
    if(!skip)
      argv_child[argc_child++] = strdup(argv[i]);
  }
  argv_child[argc_child++] = strdup("-home_pages");
  argv_child[argc_child++] = strdup("-log");
  if(!params.log) params.log = strdup("crawler.log");
  /* Room for the prefix plus any int slot number, filled in by each child. */
  size_t log_size = strlen(params.log) + 16;
  char* log = (char*)smalloc(log_size);
  log[0] = '\0';
  argv_child[argc_child++] = log;
  argv_child[argc_child++] = strdup("-where_start");
  const char* where = params.where_start ? params.where_start : " 1 = 1 ";
  /* 256 extra bytes hold the fixed text and the two rowid bounds. */
  size_t query_size = strlen(where) + 256;
  char* query = (char*)smalloc(query_size);
  argv_child[argc_child++] = query;
  argv_child[argc_child] = 0;

  /*
   * Table of children pids, 0 marks a free slot.
   */
  pid_t* child_pids = (pid_t*)smalloc(sizeof(pid_t) * params.crawlers_nproc);
  memset((char*)child_pids, '\0', sizeof(pid_t) * params.crawlers_nproc);

  /*
   * Get the rowid list from the mysql server.  The rows must come back in
   * ascending rowid order: each chunk is handed to a child as the range
   * [first rowid, last rowid] and unordered rows would produce overlapping
   * or incomplete ranges.
   */
  MYSQL* mysql = &params.crawl->base->mysql;
  MYSQL_RES *res;
  MYSQL_ROW row;
  int num_rows;
  int processed_rows = 0;
  snprintf(query, query_size, "select rowid from start where (%s) order by rowid", where);

  smysql_query(mysql, query);
  res = smysql_store_result(mysql);
  num_rows = (int)mysql_num_rows(res);
  if(num_rows > 0) {
    int rowid_low = 0;   /* first rowid of the chunk being collected */
    int rowid_high = 0;  /* last rowid seen so far in that chunk */
    int pending = 0;     /* number of rowids collected in that chunk */
    if(verbose) fprintf(stderr, "exploring %d entries from table start\n", num_rows);
    while((row = mysql_fetch_row(res))) {
      int child_slot;

      processed_rows++;
      if(row[0]) {
        rowid_high = atoi(row[0]);
        if(pending == 0) rowid_low = rowid_high;
        pending++;
      } else {
        /* Skip the row but still flush a pending chunk if it was the last one. */
        fprintf(stderr, "crawlers: unexpected null row[0]\n");
      }

      /*
       * Keep collecting until we have crawlers_chunk entries or we are at
       * the end of the list, then run a child.
       */
      if(pending == 0 ||
         (pending < params.crawlers_chunk && processed_rows < num_rows))
        continue;

      snprintf(query, query_size, "( rowid >= %d and rowid <= %d ) and (%s)", rowid_low, rowid_high, where);
      if(verbose) fprintf(stderr, "query = %s\n", query);

      child_slot = crawlers_acquire_slot(child_pids, params.crawlers_nproc);

      /*
       * fork/exec a child crawler
       */
      child_pids[child_slot] = fork();
      switch(child_pids[child_slot]) {
      case 0:
        snprintf(log, log_size, "%s%04d", params.log, child_slot);
        if(verbose) fprintf(stderr, "running %s\n", query);
        execvp(argv[0], argv_child);
        /* execvp only returns on failure. */
        perror("crawlers: execvp");
        /* _exit: do not run the exit handlers or flush the stdio buffers inherited from the parent. */
        _exit(-1);
      case (pid_t)-1:
        perror("crawlers: cannot run crawler");
        /* Release the slot, otherwise it would stay busy forever. The chunk is not retried. */
        child_pids[child_slot] = 0;
        break;
      default:
        if(verbose) fprintf(stderr, "run pid=%08d %s\n", (int)child_pids[child_slot], query);
        break;
      }
      pending = 0;
    }
  }

  /*
   * Wait for all children to die.
   */
  {
    int status;
    pid_t child;
    while((child = wait(&status)) != (pid_t)-1) {
      if(verbose) fprintf(stderr, "die pid=%08d status=0x%08x\n", (int)child, status);
    }
    /* ECHILD is the normal end: no child left to wait for. */
    if(errno != ECHILD) {
      perror("crawler: wait: ");
      exit(-1);
    }
  }

  mysql_free_result(res);
  free(child_pids);
  /* log and query are entries of argv_child and are released here too. */
  for(i = 0; i < argc_child; i++) {
    free(argv_child[i]);
  }
  free(argv_child);
}

