/*
 *   Copyright (C) 1997, 1998, 1999 Loic Dachary
 *
 *   This program is free software; you can redistribute it and/or modify it
 *   under the terms of the GNU General Public License as published by the
 *   Free Software Foundation; either version 2, or (at your option) any
 *   later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the Free Software
 *   Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
 *
 */
/*
 * Bind TCP/IP reading to database structures
 */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif /* HAVE_CONFIG_H */

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#include <ctype.h>
#include <string.h>
#include <stdio.h>
#ifdef HAVE_DMALLOC_H
#include <dmalloc.h>
#endif /* HAVE_DMALLOC_H */

#include <split.h>
#include <uri.h>
#include <creatp.h>
#include <write_file.h>
#include <salloc.h>

#include <sqlutil.h>
#include <http.h>
#include <webbase_url.h>
#include <html.h>
#include <webtools.h>

/*
 * Trace level shared by the functions of this file.
 * 0 disables the trace messages, any non zero value enables them and
 * values above 1 enable the most detailed ones.
 */
static int verbose = 0;

/*
 * Set the trace level used by the HTTP callbacks of this file.
 * The value is stored as is, no range check is done.
 */
void http_verbose(int level)
{
  verbose = level;
}

/*
 * Header callback: interpret the HTTP response header held in buffer
 * and record what it says in the url description of the context.
 *
 * arg          really a http_context_t*; its webbase_url and cookies
 *              members are the ones updated.
 * (unnamed)    second argument, not used here.
 * buffer       header text, modified in place. It must be NUL terminated
 *              since the CR LF reduction below stops on the terminator,
 *              not on buffer_size.
 * buffer_size  number of bytes in buffer.
 * flag         nothing is done when equal to WEBTOOLS_READER_START.
 *
 * Always returns 0.
 */
int http_header(char* arg, int, char* buffer, int buffer_size, int flag)
{
  http_context_t* context = (http_context_t*)arg;
  webbase_url_t* webbase_url = context->webbase_url;
  cookies_t* cookies = context->cookies;

  char* from = buffer;
  char* to = buffer;

  if(flag == WEBTOOLS_READER_START) return 0;

  if(verbose > 1) fprintf(stderr, "http_header: got %.*s\n", buffer_size, buffer);

  /*
   * Replace every CR LF by a single LF, in place. buffer_size is
   * decremented for each pair so that it still matches the text.
   */
  while(*from) {
    if(from[0] == '\r' && from[1] == '\n') {
      *to++ = '\n';
      from += 2;
      buffer_size--;
    } else {
      *to++ = *from++;
    }
  }
  *to = '\0';

  if(verbose) fprintf(stderr, "http_header: reduced to %.*s\n", buffer_size, buffer);

  {
    char** lines = NULL;
    int lines_count = 0;
    int i;
    split_inplace(buffer, buffer_size, &lines, &lines_count, '\n', SPLIT_TRIM);

    /*
     * Nothing to interpret if the split did not produce a single line,
     * do not touch lines[0] in this case.
     */
    if(lines == NULL || lines_count < 1) {
      fprintf(stderr, "http_header: empty header, nothing to interpret\n");
      return 0;
    }

    /*
     * Interpret status line
     */
    {
      char* tmp;
      if((tmp = strchr(lines[0], ' '))) {
        /*
         * tmp points to the space before the code, atoi skips it.
         */
        int code = atoi(tmp);
        /*
         * set url code (not found, not modified, ...)
         */
        webbase_url_code_set(webbase_url, code);
      } else {
        fprintf(stderr, "http_header: could not find code in %s, assume ok\n", lines[0]);
      }
    }
    /*
     * Load fields
     */
    for(i = 1; i < lines_count; i++) {
      char* key = lines[i];
      char* colon = strchr(key, ':');
      char* value;
      char* tmp;

      if(colon == NULL) {
        fprintf(stderr, "http_header: missing : at %s\n", lines[i]);
        continue;
      }

      /*
       * Cut the line at the colon: key is what comes before, without
       * trailing white space, value is what comes after, without
       * leading white space. The key is always terminated, even if the
       * colon is the first character of the line.
       * The ctype functions are given unsigned char values, a negative
       * char (byte >= 0x80) is not a valid argument for them.
       */
      value = colon + 1;
      *colon = '\0';
      for(tmp = colon; tmp > key && isspace((unsigned char)tmp[-1]); tmp--) {
        tmp[-1] = '\0';
      }
      while(isspace((unsigned char)*value)) value++;

      /*
       * Field names are compared in lower case.
       */
      for(tmp = key; *tmp; tmp++) {
        *tmp = (char)tolower((unsigned char)*tmp);
      }

      if(!strcmp(key, "content-type")) {
        webbase_url_content_type_set(webbase_url, value);
      } else if(!strcmp(key, "content-length")) {
        webbase_url_content_length_set(webbase_url, atoi(value));
      } else if(!strcmp(key, "last-modified")) {
        webbase_url_mtime_set(webbase_url, value);
      } else if(!strcmp(key, "set-cookie")) {
        cookies_load(cookies, webbase_url->w_url, value);
        webbase_url->w_info |= WEBBASE_URL_INFO_COOKIE;
      } else if(!strcmp(key, "location")) {
        webbase_url_location_set(webbase_url, value);
      }
      /*
       * Any other field is silently ignored.
       */
    }
  }
  return 0;
}

/*
 * Maximum number of bytes, from the beginning of the body, that are
 * kept aside and handed to html_content_begin.
 */
#define HTTP_BODY_PARSE_LENGTH 10240

/*
 * Body callback: store the chunk of body it is given and collect the
 * first HTTP_BODY_PARSE_LENGTH bytes of the body for html_content_begin.
 *
 * arg        really a http_context_t*; its webbase_url member is updated.
 * (unnamed)  second argument, not used here.
 * body       the chunk of body just read.
 * body_size  number of bytes in body, a value <= 0 means no data.
 * flag       WEBTOOLS_READER_START for the first chunk,
 *            WEBTOOLS_READER_CONTINUE or WEBTOOLS_READER_END for the
 *            following ones.
 *
 * The beginning of the body is accumulated in function level static
 * variables: the collection for one document must be over before the
 * collection for another one starts.
 *
 * Always returns 0.
 */
int http_body(char* arg, int, char* body, int body_size, int flag)
{
  http_context_t* context = (http_context_t*)arg;
  webbase_url_t* webbase_url = context->webbase_url;
  static char parse_buffer[HTTP_BODY_PARSE_LENGTH + 1];
  static int parse_buffer_length;
  static int parse_buffer_accumulate = 0;
  /*
   * Number of bytes usable for the parse buffer. A negative body_size
   * must never be used as a memcpy length or as a %.*s precision.
   */
  int chunk_size = body_size > 0 ? body_size : 0;

#ifdef WEBBASE_CONTENT_BASE
  /*
   * Content is accumulated in memory and stored in the database when
   * the last chunk is received.
   */
  if(flag == WEBTOOLS_READER_START) {
    if(webbase_url->content) webbase_url->content[0] = '\0';
    webbase_url->content_length = 0;
  }
  
  if(body_size > 0) {
    static_alloc(&webbase_url->content, &webbase_url->content_size, webbase_url->content_length + body_size);
    memcpy(webbase_url->content + webbase_url->content_length, body, body_size);
    webbase_url->content_length += body_size;
    webbase_url->w_info |= WEBBASE_URL_INFO_CONTENT;
  }

  if(flag == WEBTOOLS_READER_END &&
     webbase_url->content_length > 0) {
    MYSQL* mysql = &context->crawl->base->mysql;
    /*
     * Escaping may double the size of the content, the 128 extra bytes
     * are for the text of the query around it.
     */
    char* query = smalloc(webbase_url->content_length * 2 + 128);
    int query_length = 0;
    int escaped_length;
#ifdef TABLE_SPLIT
    sprintf(query, "replace into url_content%02d values (%d, '", (webbase_url->w_rowid % TABLE_SPLIT_SIZE), webbase_url->w_rowid);
#else /* TABLE_SPLIT */
    sprintf(query, "replace into url_content values (%d, '", webbase_url->w_rowid);
#endif /* TABLE_SPLIT */
    query_length += strlen(query);
    escaped_length = mysql_real_escape_string(mysql, query + query_length, webbase_url->content, webbase_url->content_length);
    query_length += escaped_length;
    sprintf(query + query_length, "')");
    smysql_query(mysql, query);
    /*
     * The query is not needed once sent, do not leak it.
     * NOTE(review): assumes smalloc returns memory that free() can
     * release -- confirm against salloc.h.
     */
    free(query);
  }
#else /* WEBBASE_CONTENT_BASE */
  /*
   * Content is stored in a file: created with the first chunk,
   * extended with the following ones.
   */
  if(flag == WEBTOOLS_READER_START) {
    /*
     * get file name for storage
     */
    webbase_url->path = webbase_url_file(webbase_url->w_rowid);
    if(webbase_url->path) {
      /*
       * create file and intermediate directories
       */
      creatp(webbase_url->path);
      write_file(webbase_url->path, body, body_size);
      webbase_url->w_info |= WEBBASE_URL_INFO_CONTENT;
    } else {
      fprintf(stderr, "cannot cache %s, unable to resolve path (ignored)\n", webbase_url->w_url);
    }
  } else if(webbase_url->path && body_size > 0 &&
            (flag == WEBTOOLS_READER_CONTINUE || flag == WEBTOOLS_READER_END)) {
    /* 
     * "read-in-progress", just append body to file
     */
    append_file(webbase_url->path, body, body_size);
  }
#endif /* WEBBASE_CONTENT_BASE */


  if(verbose) fprintf(stderr, "http_body: condition = 0x%x\n", flag);

  if(flag == WEBTOOLS_READER_START) {
    /*
     * First chunk: restart the collection from an empty buffer.
     * parse_buffer has one more byte than what is cleared and copied
     * here, this last byte is never written and keeps the buffer NUL
     * terminated for the %s below.
     */
    parse_buffer_accumulate = 1;
    memset(parse_buffer, '\0', HTTP_BODY_PARSE_LENGTH);
    parse_buffer_length = chunk_size > HTTP_BODY_PARSE_LENGTH ? HTTP_BODY_PARSE_LENGTH : chunk_size;
    if(parse_buffer_length > 0) memcpy(parse_buffer, body, parse_buffer_length);
    if(verbose) fprintf(stderr, "http_body: parse_buffer = %s\n", parse_buffer);
  } else if(parse_buffer_accumulate &&
     (flag == WEBTOOLS_READER_CONTINUE || flag == WEBTOOLS_READER_END)) {
    /*
     * Following chunks: append as much as the buffer can still hold.
     */
    int maximum = HTTP_BODY_PARSE_LENGTH - parse_buffer_length;
    int length = chunk_size > maximum ? maximum : chunk_size;
    if(length > 0) memcpy(parse_buffer + parse_buffer_length, body, length);
    if(verbose) fprintf(stderr, "http_body: collect = %d more bytes (%.*s)\n", length, length, body);
    parse_buffer_length += length;
    if(verbose) fprintf(stderr, "http_body: parse_buffer = %s\n", parse_buffer);
  }

  if(parse_buffer_accumulate &&
     (parse_buffer_length >= HTTP_BODY_PARSE_LENGTH || flag == WEBTOOLS_READER_END)) {
    if(verbose > 1) fprintf(stderr, "http_body: accumulate %.*s\n", parse_buffer_length, parse_buffer);
    /*
     * Stop data collection
     */
    parse_buffer_accumulate = 0;
    /*
     * Extract meta information & all. Don't try to find them above 10k.
     */
    html_content_begin(parse_buffer, parse_buffer_length, webbase_url);
  }

  webbase_url->w_info |= WEBBASE_URL_INFO_READING;

  return 0;
}
