/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <string.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif /* HAVE_MALLOC_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <hash.h>
#include <split.h>
#include <uri.h>
#include <salloc.h>
#include <string2time.h>
#include <file_size.h>

#include <webbase_url.h>
#include <html.h>

/*
 * Shared url object handed out by webbase_url_object(): allocated on
 * the first call, reset and reused on every later one.
 */
static webbase_url_t* _webbase_url_object = 0;

/*
 * When non-zero, webbase_url_walk_href() traces every href it visits
 * on stderr. Initialised to 0.
 * NOTE(review): no code visible here ever sets it - presumably toggled
 * by hand when debugging.
 */
static int verbose = 0;

/*
 * Return the shared webbase_url_t instance.
 *
 * The first call allocates it with smalloc() and zero-fills it. Every
 * later call returns the very same object after a
 * WEBBASE_URL_RESET_TOTAL reset, so all callers share one object and
 * a new call discards what the previous caller stored in it.
 */
webbase_url_t* webbase_url_object()
{
  /* Already created: recycle it. */
  if(_webbase_url_object != 0) {
    webbase_url_reset(_webbase_url_object, WEBBASE_URL_RESET_TOTAL);
    return _webbase_url_object;
  }

  /* First call: create it with every field (pointers included) at zero. */
  _webbase_url_object = (webbase_url_t*)smalloc(sizeof(webbase_url_t));
  memset((char*)_webbase_url_object, '\0', sizeof(webbase_url_t));
  return _webbase_url_object;
}

/*
 * Release the dynamically allocated strings of webbase_url (w_url,
 * w_base_url, w_relative, w_absolute and w_location).
 *
 * The webbase_url_t structure itself is NOT freed. Every pointer is
 * cleared once released: webbase_url_reset() writes through any
 * non-null string pointer, so leaving them dangling would turn the
 * next reset of this object into a use-after-free. The matching
 * *_length fields are zeroed as well so the recorded size agrees with
 * the (now absent) buffer the next time static_alloc() is handed the
 * pair.
 */
void webbase_url_free(webbase_url_t* webbase_url)
{
  /* free() accepts a null pointer, no need to test before calling it. */
  free(webbase_url->w_url);
  webbase_url->w_url = 0;
  webbase_url->w_url_length = 0;

  free(webbase_url->w_base_url);
  webbase_url->w_base_url = 0;
  webbase_url->w_base_url_length = 0;

  free(webbase_url->w_relative);
  webbase_url->w_relative = 0;
  webbase_url->w_relative_length = 0;

  free(webbase_url->w_absolute);
  webbase_url->w_absolute = 0;
  webbase_url->w_absolute_length = 0;

  free(webbase_url->w_location);
  webbase_url->w_location = 0;
  webbase_url->w_location_length = 0;
}

/*
 * Make to a copy of from.
 *
 * to is first reset (WEBBASE_URL_RESET_TOTAL), then receives the
 * scalar fields, the fixed-size strings and the md5 strings of from.
 * The dynamically allocated strings are duplicated into buffers owned
 * by to (grown with static_alloc() to the size recorded in from):
 *   - w_url whenever from has one,
 *   - w_base_url, w_relative, w_absolute and w_location only when the
 *     matching WEBBASE_URL_INFO_* bit is set in from->w_info.
 *
 * Copying an object onto itself is a no-op.
 */
void webbase_copy_url(webbase_url_t* to, webbase_url_t* from)
{
  /*
   * The reset below would wipe the source before anything is read
   * from it if both arguments were the same object.
   */
  if(to == from) return;

  webbase_url_reset(to, WEBBASE_URL_RESET_TOTAL);
  
  to->w_rowid = from->w_rowid;
  /*
   * A url object that was never filled (fresh from
   * webbase_url_object()) has no w_url buffer yet: leave the
   * destination url empty instead of handing a null pointer to strcpy.
   */
  if(from->w_url) {
    static_alloc(&to->w_url, &to->w_url_length, from->w_url_length);
    strcpy(to->w_url, from->w_url);
  }
  memcpy(to->w_url_md5, from->w_url_md5, MD5_ASCII_SIZE);
  to->w_info = from->w_info;
  to->w_code = from->w_code;
  to->w_mtime = from->w_mtime;
  to->w_mtime_error = from->w_mtime_error;
  to->w_tags = from->w_tags;
  strcpy(to->w_content_type, from->w_content_type);
  to->w_content_length = from->w_content_length;
  memcpy(to->w_md5, from->w_md5, MD5_ASCII_SIZE);
  to->w_complete_rowid = from->w_complete_rowid;
  to->w_crawl = from->w_crawl;
  to->w_hookid = from->w_hookid;
  strcpy(to->w_extract, from->w_extract);
  strcpy(to->w_language, from->w_language);
  strcpy(to->w_title, from->w_title);
  strcpy(to->w_keywords, from->w_keywords);
  strcpy(to->w_description, from->w_description);

  /* Optional strings: only meaningful when the info bit says so. */
  if(from->w_info & WEBBASE_URL_INFO_BASE) {
    static_alloc(&to->w_base_url, &to->w_base_url_length, from->w_base_url_length);
    strcpy(to->w_base_url, from->w_base_url);
  }

  if(from->w_info & WEBBASE_URL_INFO_RELATIVE) {
    static_alloc(&to->w_relative, &to->w_relative_length, from->w_relative_length);
    strcpy(to->w_relative, from->w_relative);
  }

  if(from->w_info & WEBBASE_URL_INFO_ABSOLUTE) {
    static_alloc(&to->w_absolute, &to->w_absolute_length, from->w_absolute_length);
    strcpy(to->w_absolute, from->w_absolute);
  }

  if(from->w_info & WEBBASE_URL_INFO_LOCATION) {
    static_alloc(&to->w_location, &to->w_location_length, from->w_location_length);
    strcpy(to->w_location, from->w_location);
  }
}

/*
 * Put webbase_url back in its empty state without releasing any
 * buffer: strings are truncated in place, numbers are zeroed and
 * w_language is set to "unknown".
 *
 * With flag == WEBBASE_URL_RESET_PARTIAL the following fields are
 * left untouched: w_rowid, w_url, w_url_md5, w_mtime, w_mtime_error,
 * w_tags, w_complete_rowid and w_hookid. Any other flag value
 * (WEBBASE_URL_RESET_TOTAL for instance) clears them too.
 */
void webbase_url_reset(webbase_url_t* webbase_url, int flag)
{
  /* Cleared by every kind of reset. */
  webbase_url->path = 0;
  webbase_url->w_info = 0;
  webbase_url->w_code = 0;
  webbase_url->w_content_length = 0;
  webbase_url->w_crawl = 0;
  memset(webbase_url->w_md5, '\0', MD5_ASCII_SIZE);

  webbase_url->w_content_type[0] = '\0';
  webbase_url->w_extract[0] = '\0';
  webbase_url->w_title[0] = '\0';
  webbase_url->w_keywords[0] = '\0';
  webbase_url->w_description[0] = '\0';
  strcpy(webbase_url->w_language, "unknown");

  /* Dynamically allocated strings may not exist yet. */
  if(webbase_url->w_base_url) webbase_url->w_base_url[0] = '\0';
  if(webbase_url->w_relative) webbase_url->w_relative[0] = '\0';
  if(webbase_url->w_absolute) webbase_url->w_absolute[0] = '\0';
  if(webbase_url->w_location) webbase_url->w_location[0] = '\0';

  /* A partial reset stops here and keeps the remaining fields. */
  if(flag == WEBBASE_URL_RESET_PARTIAL) return;

  webbase_url->w_rowid = 0;
  if(webbase_url->w_url) webbase_url->w_url[0] = '\0';
  memset(webbase_url->w_url_md5, '\0', MD5_ASCII_SIZE);
  webbase_url->w_mtime = 0;
  webbase_url->w_mtime_error = 0;
  webbase_url->w_tags = 0;
  webbase_url->w_complete_rowid = 0;
  webbase_url->w_hookid = 0;
}

/*
 * Call func on every href recorded in webbase_url.
 *
 * flag selects which lists are visited: WEBBASE_URL_WALK_ABSOLUTE for
 * the space separated w_absolute list, WEBBASE_URL_WALK_RELATIVE for
 * the space separated w_relative list. A list is only visited when
 * the matching WEBBASE_URL_INFO_* bit is set in w_info. Absolute
 * hrefs are visited first; relative hrefs are resolved with uri_abs()
 * before being handed to func.
 *
 * func receives (args, webbase_url, href, kind of href). The walk
 * stops as soon as func returns 0.
 *
 * Returns 1 if the walk ran to completion (you can continue walking),
 * 0 if func stopped it (stop walking).
 */
int webbase_url_walk_href(webbase_url_t* webbase_url, int flag, webbase_url_walk_href_callback_t func, char* args)
{
  int stopped = 0;

  if((flag & WEBBASE_URL_WALK_ABSOLUTE) && (webbase_url->w_info & WEBBASE_URL_INFO_ABSOLUTE)) {
    char** fields;
    char** hrefs;
    int hrefs_count;
    int i;

    split(webbase_url->w_absolute, strlen(webbase_url->w_absolute), &fields, &hrefs_count, ' ', SPLIT_TRIM);
    /*
     * Work on a private copy of the pointer table.
     * NOTE(review): presumably because split() reuses its result
     * table and func may call split() again - confirm in split.h.
     */
    hrefs = (char**)smalloc(hrefs_count * sizeof(char*));
    memcpy(hrefs, fields, hrefs_count * sizeof(char*));

    for(i = 0; i < hrefs_count; i++) {
      if(verbose) fprintf(stderr, "walk_href absolute: %s\n", hrefs[i]);
      if(!(*func)(args, webbase_url, hrefs[i], WEBBASE_URL_WALK_ABSOLUTE)) {
        stopped = 1;
        break;
      }
    }
    free(hrefs);
  }

  if(!stopped && (flag & WEBBASE_URL_WALK_RELATIVE) && (webbase_url->w_info & WEBBASE_URL_INFO_RELATIVE)) {
    char** fields;
    char** hrefs;
    int hrefs_count;
    int i;
    /*
     * Base against which the relative hrefs are resolved.
     * NOTE(review): read from the `url' member whereas the rest of
     * this file uses `w_url' - confirm this is intended.
     */
    uri_t* base = uri_alloc(webbase_url->url, strlen(webbase_url->url));

    split(webbase_url->w_relative, strlen(webbase_url->w_relative), &fields, &hrefs_count, ' ', SPLIT_TRIM);
    hrefs = (char**)smalloc(hrefs_count * sizeof(char*));
    memcpy(hrefs, fields, hrefs_count * sizeof(char*));

    for(i = 0; i < hrefs_count; i++) {
      char* href = hrefs[i];
      char* resolved;

      if(verbose) fprintf(stderr, "walk_href relative: %s\n", href);
      resolved = uri_uri(uri_abs(base, href, strlen(href)));
      if(!(*func)(args, webbase_url, resolved, WEBBASE_URL_WALK_RELATIVE)) {
        stopped = 1;
        break;
      }
    }

    free(hrefs);
    uri_free(base);
  }

  return !stopped;
}

/*
 * Tell whether url (url_length bytes long) is suitable as a crawl
 * starting point.
 *
 * Returns 0 when
 *   - uri_object() yields no object for url (silently),
 *   - the url is relative (complaint on stderr),
 *   - the first url_length bytes of url differ from the form
 *     returned by uri_uri() (complaint on stderr),
 * and 1 otherwise.
 */
int webbase_url_start_ok(char* url, int url_length)
{
  uri_t* parsed = uri_object(url, url_length);

  if(parsed == 0) {
    return 0;
  } else if(uri_info(parsed) & URI_INFO_RELATIVE) {
    fprintf(stderr, "webbase_url_start_ok: a starting point cannot be a relative url\n");
    return 0;
  } else if(strncmp(url, uri_uri(parsed), url_length) != 0) {
    fprintf(stderr, "webbase_url_start_ok: a starting point must be cannonical\n");
    return 0;
  }

  return 1;
}

/*
 * Debugging aid meant to dump the content of webbase_url on stdout.
 *
 * The whole body is compiled out (#if 0), so the function currently
 * does nothing. The parameter is named, although unused for now,
 * because an unnamed parameter in a function definition is not valid
 * C before C23 and because the disabled code refers to it by name.
 */
void webbase_url_print(webbase_url_t* webbase_url)
{
  /* Only used by the disabled code below. */
  (void)webbase_url;
#if 0
  printf("url: %s\n", webbase_url->url);
#define S(f)  if(webbase_url->w_info & (f)) printf(#f "\n")
  S(WEBBASE_URL_INFO_FRAME);
  S(WEBBASE_URL_INFO_COMPLETE);
  S(WEBBASE_URL_INFO_COOKIE);
  S(WEBBASE_URL_INFO_BASE);
  S(WEBBASE_URL_INFO_RELATIVE);
  S(WEBBASE_URL_INFO_ABSOLUTE);
  S(WEBBASE_URL_INFO_LOCATION);
  S(WEBBASE_URL_INFO_TIMEOUT);
  S(WEBBASE_URL_INFO_NOT_MODIFIED);
  S(WEBBASE_URL_INFO_NOT_FOUND);
  S(WEBBASE_URL_INFO_HTTP);
  S(WEBBASE_URL_INFO_FTP);
  S(WEBBASE_URL_INFO_NEWS);
#undef S
#define S(f)  if(webbase_url->w_code & (f)) printf(#f "\n")
  S(WEBBASE_URL_REDIRECTED);
  S(WEBBASE_URL_FRAME);
  S(WEBBASE_URL_SUCCESS);
  S(WEBBASE_URL_FATAL);
#undef S
  printf("code: %d\n", WEBBASE_URL_CODE(webbase_url->w_code));
  printf("last-modified: %s", ctime(&webbase_url->w_mtime));
  if(webbase_url->w_mtime_error) printf("last error time : %s", ctime(&webbase_url->w_mtime_error));
  printf("content-type: %s\n", webbase_url->w_content_type);
  if(webbase_url->w_info & WEBBASE_URL_INFO_ABSOLUTE)
    printf("absolute: %.*s\n", webbase_url->w_absolute_length, webbase_url->w_absolute);
#endif
}

/*
 * Record the status code of the last fetch in webbase_url and derive
 * the w_info bits from it.
 *
 * Unless the code is WEBBASE_URL_CODE_NOT_MODIFIED,
 * WEBBASE_URL_CODE_CONNECTION_REFUSED or
 * WEBBASE_URL_CODE_CONNECTION_TIMED_OUT, the object first goes
 * through a WEBBASE_URL_RESET_PARTIAL reset. Then:
 *   - success codes set WEBBASE_URL_INFO_OK,
 *   - redirection codes set nothing,
 *   - NOT_MODIFIED sets WEBBASE_URL_INFO_NOT_MODIFIED,
 *   - NOT_FOUND sets WEBBASE_URL_INFO_ERROR and
 *     WEBBASE_URL_INFO_NOT_FOUND,
 *   - CONNECTION_REFUSED / CONNECTION_TIMED_OUT set
 *     WEBBASE_URL_INFO_TIMEOUT,
 *   - every other code, known or not, sets WEBBASE_URL_INFO_ERROR.
 * Finally w_code is set to code.
 */
void webbase_url_code_set(webbase_url_t* webbase_url, int code)
{
  /* These three codes keep what is already known about the url. */
  int keep_previous = (code == WEBBASE_URL_CODE_NOT_MODIFIED ||
                       code == WEBBASE_URL_CODE_CONNECTION_REFUSED ||
                       code == WEBBASE_URL_CODE_CONNECTION_TIMED_OUT);

  if(!keep_previous)
    webbase_url_reset(webbase_url, WEBBASE_URL_RESET_PARTIAL);

  switch(code) {

  /* Success. */
  case WEBBASE_URL_CODE_OK:
  case WEBBASE_URL_CODE_CREATED:
  case WEBBASE_URL_CODE_ACCEPTED:
  case WEBBASE_URL_CODE_NON_AUTHORITATIVE_INFORMATION:
  case WEBBASE_URL_CODE_NO_CONTENT:
  case WEBBASE_URL_CODE_RESET_CONTENT:
  case WEBBASE_URL_CODE_PARTIAL_CONTENT:
    webbase_url->w_info |= WEBBASE_URL_INFO_OK;
    break;

  /* Redirections: no info bit. */
  case WEBBASE_URL_CODE_MULTIPLE_CHOICES:
  case WEBBASE_URL_CODE_MOVED_PERMANENTLY:
  case WEBBASE_URL_CODE_MOVED_TEMPORARILY:
  case WEBBASE_URL_CODE_SEE_OTHER:
    break;

  case WEBBASE_URL_CODE_NOT_MODIFIED:
    webbase_url->w_info |= WEBBASE_URL_INFO_NOT_MODIFIED;
    break;

  case WEBBASE_URL_CODE_NOT_FOUND:
    webbase_url->w_info |= WEBBASE_URL_INFO_ERROR | WEBBASE_URL_INFO_NOT_FOUND;
    break;

  case WEBBASE_URL_CODE_CONNECTION_REFUSED:
  case WEBBASE_URL_CODE_CONNECTION_TIMED_OUT:
    webbase_url->w_info |= WEBBASE_URL_INFO_TIMEOUT;
    break;

  /* Everything else is a plain error, whether the code is known or not. */
  case WEBBASE_URL_CODE_USE_PROXY:
  case WEBBASE_URL_CODE_CONTINUE:
  case WEBBASE_URL_CODE_SWITCHING_PROTOCOLS:
  case WEBBASE_URL_CODE_BAD_REQUEST:
  case WEBBASE_URL_CODE_UNAUTHORIZED:
  case WEBBASE_URL_CODE_PAYMENT_REQUIRED:
  case WEBBASE_URL_CODE_FORBIDDEN:
  case WEBBASE_URL_CODE_METHOD_NOT_ALLOWED:
  case WEBBASE_URL_CODE_NOT_ACCEPTABLE:
  case WEBBASE_URL_CODE_PROXY_AUTHENTICATION_REQUIRED:
  case WEBBASE_URL_CODE_REQUEST_TIMEOUT:
  case WEBBASE_URL_CODE_CONFLICT:
  case WEBBASE_URL_CODE_GONE:
  case WEBBASE_URL_CODE_LENGTH_REQUIRED:
  case WEBBASE_URL_CODE_PRECONDITION_FAILED:
  case WEBBASE_URL_CODE_REQUEST_ENTITY_TOO_LARGE:
  case WEBBASE_URL_CODE_REQUEST_URI_TOO_LARGE:
  case WEBBASE_URL_CODE_UNSUPPORTED_MEDIA_TYPE:
  case WEBBASE_URL_CODE_INTERNAL_SERVER_ERROR:
  case WEBBASE_URL_CODE_NOT_IMPLEMENTED:
  case WEBBASE_URL_CODE_BAD_GATEWAY:
  case WEBBASE_URL_CODE_SERVICE_UNAVAILABLE:
  case WEBBASE_URL_CODE_GATEWAY_TIMEOUT:
  case WEBBASE_URL_CODE_HTTP_VERSION_NOT_SUPPORTED:
  default:
    webbase_url->w_info |= WEBBASE_URL_INFO_ERROR;
    break;
  }

  /* Stored last: the partial reset above zeroes w_code. */
  webbase_url->w_code = code;
}

/*
 * Predicate: does the url of webbase_url designate a robots.txt file?
 *
 * Returns 1 when w_url is strictly longer than "robots.txt" and ends
 * with that exact string, 0 otherwise.
 */
int webbase_url_robots_p(webbase_url_t* webbase_url)
{
  static const char suffix[] = "robots.txt";
  const int suffix_length = (int)(sizeof(suffix) - 1);

  char* url = webbase_url->w_url;
  int url_length = strlen(url);

  /* Too short to hold anything in front of the suffix. */
  if(url_length <= suffix_length) return 0;

  return strcmp(url + (url_length - suffix_length), suffix) == 0;
}

/*
 * Store the content type of the document in w_content_type.
 *
 * At most WEBBASE_CONTENT_TYPE_LENGTH characters of content_type are
 * kept. A trailing '; something' parameter is removed together with
 * the blanks (spaces or tabs) that precede the ';'.
 */
void webbase_url_content_type_set(webbase_url_t* webbase_url, char* content_type)
{
  char* type = webbase_url->w_content_type;
  char* sep;

  strncpy(type, content_type, WEBBASE_CONTENT_TYPE_LENGTH);
  /*
   * strncpy does not terminate the destination when the source fills
   * it: terminate before anything scans the buffer. The slot at index
   * WEBBASE_CONTENT_TYPE_LENGTH is the one this function has always
   * written its terminator to.
   */
  type[WEBBASE_CONTENT_TYPE_LENGTH] = '\0';

  /*
   * Strip trailing '; something' if any.
   * May be used in the future if it contains a document encoding 
   * specification.
   */
  sep = strchr(type, ';');
  if(sep) {
    *sep = '\0';
    /*
     * Erase the blanks in front of the separator, looking one
     * character back so the pointer never moves before the buffer.
     */
    while(sep > type && (sep[-1] == ' ' || sep[-1] == '\t')) {
      *--sep = '\0';
    }
  }
}

/*
 * Store in w_mtime the value string2time() computes from the mtime
 * string.
 * NOTE(review): presumably mtime is the text of a Last-Modified
 * header - confirm against the callers.
 */
void webbase_url_mtime_set(webbase_url_t* webbase_url, char* mtime)
{
  webbase_url->w_mtime = string2time(mtime);
}

/*
 * Record in w_location the url that location redirects webbase_url to.
 *
 * location is parsed into a uri object kept from one call to the
 * next. When it is relative it is resolved against w_url, otherwise
 * it is stored as given. On success the WEBBASE_URL_INFO_LOCATION and
 * WEBBASE_URL_INFO_COMPLETE bits are set in w_info.
 *
 * Nothing is stored, and a message is printed on stderr, when
 *   - location is not accepted as a valid url,
 *   - location is relative and w_url cannot be parsed to resolve it,
 *   - the resulting url is w_url itself.
 */
void webbase_url_location_set(webbase_url_t* webbase_url, char* location)
{
  /* Parser object allocated on the first call and reused afterwards. */
  static uri_t* location_object = 0;
  int location_cannonical = URI_CANNONICAL;
  int location_length = strlen(location);
  char* url;

  if(!location_object) {
    location_object = uri_alloc(location, location_length);
  } else {
    location_cannonical = uri_realloc(location_object, location, location_length);
  }

  if(location_cannonical != URI_CANNONICAL || !location_object) {
    fprintf(stderr, "url_location_set: %s is not a valid URL\n", location);
    return;
  }
  if(location_object->info & URI_INFO_RELATIVE) {
    uri_t* url_object = uri_object(webbase_url->w_url, strlen(webbase_url->w_url));
    /*
     * uri_object() can fail (webbase_url_start_ok() tests its result
     * for the same reason): do not hand a null base to uri_abs_1().
     */
    if(!url_object) {
      fprintf(stderr, "url_location_set: cannot resolve %s, %s is not a valid URL\n", location, webbase_url->w_url);
      return;
    }
    url = uri_uri(uri_abs_1(url_object, location_object));
  } else {
    url = location;
  }

  /* A document redirecting to itself would loop forever. */
  if(!strcmp(webbase_url->w_url, url)) {
    fprintf(stderr, "url_location_set: %s points to itself, ignored\n", url);
    return;
  }

  static_alloc(&webbase_url->w_location, &webbase_url->w_location_length, strlen(url) + 1);
  strcpy(webbase_url->w_location, url);
  webbase_url->w_info |= WEBBASE_URL_INFO_LOCATION | WEBBASE_URL_INFO_COMPLETE;
}

/*
 * Store content_length in the w_content_length field of webbase_url.
 */
void webbase_url_content_length_set(webbase_url_t* webbase_url, int content_length)
{
  webbase_url->w_content_length = content_length;
}

/*
 * If no content length is recorded for webbase_url (w_content_length
 * is 0), use the value file_size() reports for path instead.
 * A non-zero w_content_length is left alone.
 */
void webbase_url_content_length_fix(webbase_url_t* webbase_url, char* path)
{
  /* Already known: nothing to fix. */
  if(webbase_url->w_content_length != 0) return;

  webbase_url->w_content_length = file_size(path);
}

/*
 * When the server could not tell the last modification date
 * (w_mtime is still 0), fall back to the current time, i.e. the
 * time of the last load. A non-zero w_mtime is left alone.
 */
void webbase_url_mtime_fix(webbase_url_t* webbase_url)
{
  /* A date is already recorded: keep it. */
  if(webbase_url->w_mtime != 0) return;

  webbase_url->w_mtime = time(0);
}

/*
 * Thin wrapper: forwards to html_href(path, webbase_url, size_hrefs).
 * NOTE(review): presumably html_href() collects the hrefs of the
 * document stored in path into webbase_url, with size_hrefs as a
 * limit - confirm in html.h.
 */
void webbase_url_hrefs_set(webbase_url_t* webbase_url, char* path, int size_hrefs)
{
  html_href(path, webbase_url, size_hrefs);
}

/*
 * Build the path name associated with webbase_url:
 *
 *   <root>/aa/bb/cc/dd
 *
 * where <root> is the value of uri_get_root() (the '/' after it is
 * omitted when the root is empty) and aa, bb, cc, dd are the four
 * bytes of w_rowid, most significant first, as two hexadecimal
 * digits each.
 *
 * Returns a pointer to a buffer owned by this function and reused by
 * every call: the result is overwritten by the next call and must not
 * be freed by the caller.
 */
char* webbase_url_file(webbase_url_t* webbase_url)
{
  static char* file = 0;
  /* Not named file_size: that would hide the file_size() function. */
  static int file_length = 0;

  /* Fetch the root once, it is needed for the size and the content. */
  const char* root = uri_get_root();
  /* root + '/' + room for the hexadecimal part + terminator. */
  int needed = strlen(root) + 1 + 32 + 1;
  /* Unsigned so that shifts are well defined and %x gets its type. */
  unsigned int rowid = (unsigned int)webbase_url->w_rowid;

  static_alloc(&file, &file_length, needed);

  /* Bounded by the size that was just requested for the buffer. */
  snprintf(file, needed, "%s%s%02x/%02x/%02x/%02x",
	   root,
	   (root[0] == '\0' ? "" : "/"),
	   ((rowid >> 24) & 0xff),
	   ((rowid >> 16) & 0xff),
	   ((rowid >> 8) & 0xff),
	   (rowid & 0xff));

  return file;
}
