//______________________________________________________________________________

//	Java Virtual Shelf
//______________________________________________________________________________

package org.demo.webwader;

import org.ariane.tools.*;
import java.io.*;
import java.util.*;
import java.net.HttpURLConnection;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.ContentModel;
import javax.swing.tree.*;

/**
 * SiteScanner : a tool used to scan the content of a web site.
 * <p>
 * A site is defined by the topmost node of a tree of pages.
 * @see WebSite
 * @see CheckSite
 * @see WebPage
 * @version $Id: SiteScanner.java,v 3.4 2000/11/12 10:23:46 lefevre Exp $
 * @author Jean-Paul Le F&egrave;vre
 */

public class SiteScanner extends Nobject {
  /**
   * The unique instance of the scanner, created on the first call
   * to {@link #instance()}.
   */
private static SiteScanner scanner = null;
  /**
   * @serial The top of the hierarchy of web pages.
   * It is null after construction and is replaced by each call to scan().
   */
private WebNode top;
  /**
   * @serial The url of the top node directory.
   * It is cleared at the beginning of each scan and computed on demand
   * by getTopDirURL().
   */
private String top_dir_url = null;
  /**
   * @serial The number of pages in the site.
   * scan() starts counting at 1 and scanPage() increments it for each
   * node added to the tree.
   */
private int node_nbr;
  /**
   * @serial The maximum number of pages allowed in a site.
   * Read from the resources by the constructor.
   */
private int max_node_nbr;
  /**
   * @serial The maximum directory depth allowed in a site.
   * Read from the resources by the constructor.
   */
private int max_dir_depth;
  /**
   * @serial The list of strings marking ignored locations.
   * It is created by loadIgnoredList(); its elements are cast to String
   * by isIgnored().
   */
private Collection ignored;
  /**
   * @serial The list of frames found.
   * A new list is created by each scan; it holds the WebNode objects
   * reached through a frame tag, to be processed by mergeFrames().
   */
private Collection frames;
  /**
   * @serial The flag indicating that a scan must be stopped.
   * Set by giveUpScan(), read by giveUp(), cleared when a scan starts.
   */
private boolean give_up = false;
  /**
   * @serial The current scanning order.
   * It is compared against WebNode.BREADTH_FIRST, WebNode.PREORDER
   * and WebNode.POSTORDER.
   */
private int scan_order = WebNode.BREADTH_FIRST;
  /**
   * @serial The path of recursion in a post order scan.
   * It is allocated by scan() only when the order is WebNode.POSTORDER.
   */
private MiniSet scan_path = null;
  /**
   * @serial The current printer, receiving the progress messages.
   * The constructor connects it to System.out; it can be replaced
   * with setPrinter().
   */
private	PrintWriter  printer;

//______________________________________________________________________________
/**
 * Gets the unique instance of the scanner.
 * <br>
 * The class is a singleton : the instance is created on the first call
 * and the same object is returned by every following call.
 * The method is synchronized so that two threads asking for the instance
 * at the same time cannot create two different scanners.
 * @return the unique instance.
 */
public static synchronized SiteScanner instance()
  {
      if(scanner == null) {
          scanner = new SiteScanner();
      }

      return scanner;
  }
//______________________________________________________________________________
/**
 * Creates the scanner.
 * <P>
 * The maximum number of nodes allowed in a site is found in the properties,
 * as well as the maximum directory depth.
 * @see #loadIgnoredList
 */
private SiteScanner()
  {
      super("SiteScanner");
      printer       = new PrintWriter(System.out);
      top           = null;
      node_nbr      = 0;
      max_node_nbr  = Resources.instance().get(
		      Resources.PREFIX + "MaxNodeNbr", 200);
      max_dir_depth = Resources.instance().get(
		      Resources.PREFIX + "MaxDirDepth", 10);
  }
//______________________________________________________________________________
/**
 * Replaces the printer receiving the messages of the scanner.
 * <P>
 * It allows, for instance, to redirect the messages to a text area
 * widget instead of a terminal. The given object is stored as is,
 * without any check.
 * @param printer the new printer.
 */
public final void setPrinter(PrintWriter printer)
    {
        this.printer = printer;
    }
//______________________________________________________________________________
/**
 * Builds the tree of pages knowing the root of this site.
 * <P>
 * The url passed must be absolute. Relative paths to a file are
 * not accepted. From the page specified by the url, the whole site
 * is analyzed.
 * <P>
 * The site can be scanned in 3 different orders : breadth first,
 * preorder and postorder. Breadth first is the initial order of a new
 * scanner. Postorder should be avoided : the original author flagged its
 * algorithm as unreliable.
 * <P>
 * Each call resets the state left by a previous scan (node counter,
 * give up flag, cached top directory, ignored list, frames, recursion path).
 * Automatic following of HTTP redirections is switched off for the
 * whole virtual machine (<code>HttpURLConnection.setFollowRedirects</code>)
 * and is not restored afterwards.
 * The printer is flushed when the method ends, even if the scan is
 * aborted by an exception.
 *
 * @param sitename the name of the site; it is used to look for the
 * site specific list of ignored strings in the resources.
 * @param url the url of the site.
 * @param order the order used to scan the hierarchy.
 * @return the topmost node.
 * @throws InvalidLocatorException if a location is not valid, in
 * particular if the top page is not readable.
 * @see #scanPage
 */
public WebNode scan(String sitename, String url, int order)
                                                 throws InvalidLocatorException
  {
      long ti         = System.currentTimeMillis();
      node_nbr        = 1;      // The top page is the first node.
      EditorKit kit   = new HTMLEditorKit();
      give_up         = false;
      scan_order      = order;
      top_dir_url     = null;   // Will be recomputed by getTopDirURL().

      // Common list first, then the strings specific to this site.
      loadIgnoredList();
      loadIgnoredList(
          Resources.instance().get(Resources.PREFIX + sitename +".Ignored"));
      if(ToolBox.debug) ToolBox.warn("Ignored : " + getIgnoredList());

      // Don't let the server manage automatically the links.
      HttpURLConnection.setFollowRedirects(false);
      Locator locator = Locator.makeLocator(url);

      // If the top page is redirected start from the real location.
      String redirect = locator.getRedirectedLocation();
      if(redirect != null) locator = Locator.makeLocator(redirect);
      if(! locator.isReadable()) {
          throw new InvalidLocatorException("Not readable " + locator);
      }

      top    = new WebNode(locator);
      frames = new ArrayList();

      // The recursion path is only needed by the post order scan;
      // don't keep the one of a previous scan around.
      if(scan_order == WebNode.POSTORDER)
          scan_path = new MiniSet(max_node_nbr);
      else
          scan_path = null;

      try {
          scanPage(kit, top); // Let's go !

          if(frames.size() > 0)
              mergeFrames();

          long dt = (System.currentTimeMillis() - ti) / 1000;

          if(ToolBox.verbose) {
              printer.println();
              printer.println("Found " + node_nbr + " nodes.");
              printer.println("Scan done in " + dt + " s.");
          }

          if(ToolBox.debug) {
              printer.println(top.getVolume());
          }

          return top;
      }
      finally {
          // Make sure that pending messages are displayed, whatever
          // the way the scan ended.
          printer.flush();
      }
  }
//______________________________________________________________________________
/**
 * Scans a document and extracts nodes from it.
 * <P>
 * The document designated by the locator of the parent node is parsed
 * with the editor kit, then each frame source and each anchor reference
 * found in it is examined. A link becomes a new node when it is
 * acceptable, served by the server of the top node, not matched by the
 * ignored list, below the directory of the top node, readable
 * (read code in the range 200-299) and not yet stored in the tree.
 * <P>
 * The method is called recursively to traverse the whole site; the moment
 * when the recursion takes place depends on the current scanning order.
 * If the give up flag is set to true, it returns immediately.
 * A document which is not HTML or which cannot be read is silently skipped.
 * The reader opened on the document is closed as soon as the parsing
 * is finished.
 *
 * @param kit the common EditorKit.
 * @param parent the node being scanned.
 * @throws InvalidLocatorException if the location of the parent is deeper
 * than the maximum directory depth, or if it is raised by a recursive call
 * made in breadth first order.
 * @see PageParser
 */
private void scanPage(EditorKit kit, WebNode parent)
                                                 throws InvalidLocatorException
  {
      if(giveUp()) return;

      Locator locator = parent.getLocator();

      printer.println(); printer.flush();
      printer.println("Scanning " + locator);

      // If we know that it is not a HTML document it is not necessary
      // to parse it.
      int type = locator.getContentType();
      if(type == locator.TYPE_NOT_HTML) {
          if(ToolBox.debug) printer.print("\tNot HTML : " + locator);
          return;
      }
      else if(locator.getDirDepth() > max_dir_depth)
          throw new InvalidLocatorException("Too deep location "+max_dir_depth);

      Document doc  = kit.createDefaultDocument();
      doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

      Reader rd = null;
      try {
          rd = locator.getReader();
          kit.read(rd, doc, 0);
          parent.setTitle((String)doc.getProperty(Document.TitleProperty));
      }
      catch(Exception ex) {
          if(ToolBox.verbose) ToolBox.warn("Can't read " + locator);
          return;
      }
      finally {
          // The document is fully loaded (or has been given up) : release
          // the stream, otherwise one reader per scanned page is leaked.
          if(rd != null) {
              try {
                  rd.close();
              }
              catch(IOException closeEx) {
                  // Nothing useful can be done if the close fails.
              }
          }
      }

      Element elem;
      ElementIterator it    = new ElementIterator(doc);
      final String FRAME    = HTML.Tag.FRAME.toString();

      // Examine each html element found in the document.
      // Manage only <frame src = > and <a href = >.
      while((elem = it.next()) != null) {

          AttributeSet set = elem.getAttributes();
          if(set == null) continue;
          String l = null;

          if(elem.getName().equals(FRAME)) {
              // This element is a frame definition.
              // The content of the frame is found
              // following the link in src.
              l = (String)set.getAttribute(HTML.Attribute.SRC);
          }
          else {
              // This element is a plain anchor : the attributes of the
              // anchor are stored under the key HTML.Tag.A.
              // Href gives the link.
              set = (AttributeSet)set.getAttribute(HTML.Tag.A);
              if(set == null) continue;
              l = (String)set.getAttribute(HTML.Attribute.HREF);
          }

          if(l != null) l = l.trim();
          if(! Locator.isAcceptable(l)) continue; // Keep only file and http.

          // NOTE(review): the handler closing this try block also receives
          // any InvalidLocatorException raised by the recursive calls made
          // below in preorder and postorder; in these orders such an error
          // is only reported and the loop goes on with the next element.
          try {
              Locator href    = Locator.makeLocator(locator, l);
              if(ToolBox.debug)
                  printer.print("\tLink " + l);

              if(! isServed(href)) { // Linked to a foreign document.
                  if(ToolBox.debug) printer.println(" outside");
                  continue;
              }

              String redirect = href.getRedirectedLocation();
              if(redirect != null) {
                  href = Locator.makeLocator(redirect);
                  if(ToolBox.verbose)
                      printer.println(" redirected : " + href.toString());
              }

              // From now on l is the location of the (possibly redirected)
              // target, not the raw text of the link.
              l = href.getLocation();
              if(isIgnored(l)) {
                  if(ToolBox.verbose)
                      printer.println(" ignored : " + href.toString());
                  continue;
              }
              else if(! href.isBelow(getTopDirURL())) {
                  if(ToolBox.verbose)
                      printer.println(" not below : " + href.toString());
                  continue;
              }

              // A new node connected to this document
              // has been found. It is kept if it is reachable.
              int rc = href.getReadCode();
              if(200 <= rc && rc < 300) {
                  if(ToolBox.debug)
                      printer.println(" new : " + href.toString());
              }
              else {
                  if(ToolBox.verbose)
                      printer.println(" unreadable : " + href.toString()
                                      + " code : " + rc);
                  continue;
              }

              WebNode node = top.findNode(l);
              if(node != null) {        // Already stored in the tree.
                  if(ToolBox.debug)
                      printer.println(" known : " + href.toString());
                  continue;
              }

              // NOTE(review): the test is strict and is made before the
              // node is counted.
              if(node_nbr > max_node_nbr)
                  throw new TooManyNodesException("Too many nodes " + node_nbr);

              node = new WebNode(href);

              // If this node was found in a frame
              // it is kept to be reprocessed later on.
              if(elem.getName().equals(FRAME)) {
                  frames.add(node);
              }

              switch(scan_order) {

              case WebNode.PREORDER    :
                  parent.add(node);
                  node_nbr++;
                  printer.println("\tAdded " + href.toString());
                  scanPage(kit, node); // Recursively call this method.
                  break;

              case WebNode.POSTORDER   :

                  // Checks if the new node is in this path of recursion.
                  int index = scan_path.indexOf(node);
                  if(index >= 0) continue; // Found, try another one.
                  // The parent is not attached to the tree yet : look
                  // also in its own subtree.
                  if(parent.findNode(l) != null) continue;

                  scan_path.add(node);   // Leave a trail, like Hansel and Gretel.
                  scanPage(kit, node);   // Recursively call this method.

                  // The node is attached once its own subtree is built.
                  parent.add(node);

                  node_nbr++;
                  printer.println("\tAdded : " + href.toString());
                  break;

              default :
                  // Breadth first : the children are only collected here,
                  // they are scanned after the loop.
                  parent.add(node);
                  node_nbr++;
                  printer.println("\tAdded : " + href.toString());
                  break;
              }
          }
          catch(InvalidLocatorException ex) {
              ToolBox.warn("Can't make locator : " + l, ex);
          }
      }

      if(scan_order == WebNode.BREADTH_FIRST) {
          for(Enumeration en = parent.children(); en.hasMoreElements();) {
              scanPage(kit, (WebNode)en.nextElement());
          }
      }
  }
//______________________________________________________________________________
/**
 * Merges nodes corresponding to frames into their parents.
 * <br>
 * The nodes connected to a frame are moved, in the same order, after the
 * last child of the parent of the frame. The frame itself is then removed
 * from its parent.
 * <br>
 * A frame which has no parent is skipped : scanPage() records a frame node
 * before attaching it to the tree, and the node may finally never be
 * attached (e.g. when it is rejected during a post order scan).
 */
final synchronized private void mergeFrames()
  {
      for (Iterator it = frames.iterator(); it.hasNext() ;) {

          WebNode frame = (WebNode)it.next();
          WebNode node  = (WebNode)frame.getParent();

          // Recorded but never inserted in the tree : nothing to merge.
          if(node == null) continue;

          int index     = node.getChildCount();
          int nbr       = frame.getChildCount();

          // Each insertion takes the child away from the frame, so the
          // next one to move is always the first child of the frame.
          for(int i = 0; i < nbr; i++) {
              WebNode child = (WebNode)frame.getFirstChild();
              node.insert(child, index++);
          }
          node.remove(frame);
      }
  }
//______________________________________________________________________________
/**
 * Tells whether the running scan has been asked to stop.
 * @return the current value of the give up flag : true if the operation
 * must be given up.
 * @see #giveUpScan()
 */
private final synchronized boolean giveUp()
  {
      return this.give_up;
  }
//______________________________________________________________________________
/**
 * Asks the scanner to stop the scan.
 * <br>
 * The give up flag is raised; it is checked each time a page is
 * about to be scanned.
 * @see #giveUp()
 */
public final synchronized void giveUpScan()
  {
      this.give_up = true;
  }
//______________________________________________________________________________
/**
 * Checks to see if a location must be ignored.
 * <P>
 * The location is ignored if it contains at least one of the strings
 * kept in the ignored list.
 * <P>
 * Nothing is ignored when the location is null or when the list has not
 * been loaded yet (the list is only built when a scan starts).
 *
 * @param location the path to test.
 * @return true if it matches one of the strings of the list.
 */
final public boolean isIgnored(String location)
  {
      if(location == null || ignored == null) return false;

      for (Iterator it = ignored.iterator(); it.hasNext() ;) {
          if(location.indexOf((String)it.next()) >= 0)
              return true;
      }

      return false;
  }
//______________________________________________________________________________
/**
 * Converts this scanner to String.
 * <br>
 * The string gives the name of the object, the server of the top node
 * and the current number of pages. If no site has been scanned yet
 * there is no top node and the server is shown as "none".
 * @return the string.
 */
public String toString()
  {
      StringBuffer buf = new StringBuffer("SiteScanner : ").append(getName());

      // top is null until the first scan : don't fail in this case.
      String server = (top == null) ? "none" : top.getServerSpec();

      buf.append(" Server : ").append(server);
      buf.append(" Number of pages : ").append(node_nbr);

      return buf.toString();
  }
//______________________________________________________________________________
/**
 * Returns a printable form of the list of strings marking the
 * locations to ignore.
 * @return the result of toString() applied to the list, or null if the
 * list has not been created yet.
 * @see #loadIgnoredList
 */
protected final String getIgnoredList()
  {
      if(ignored == null) {
          return null;
      }

      return ignored.toString();
  }
//______________________________________________________________________________
/**
 * Initializes the list of strings used to decide if a location must be ignored.
 * <P>
 * The list is first emptied, so that strings loaded for a previous scan
 * (possibly of another site) are forgotten. It is then initialized with
 * some strings generated by Apache to display directories, and
 * the content of the resource WebWader.Ignored is appended.
 * <P>
 * The strings specific to a site are not loaded here : the caller
 * appends them afterwards with {@link #loadIgnoredList(String)}.
 * @see #getIgnoredList
 * @see #loadIgnoredList(String)
 */
final private void loadIgnoredList()
  {
      // Start from scratch : loadIgnoredList(String) only appends.
      ignored = new MiniSet();

      String list = "?N=D ?M=A ?S=A ?D=A";
      loadIgnoredList(list);

      list = Resources.instance().get(Resources.PREFIX + "Ignored");
      loadIgnoredList(list);
  }
//______________________________________________________________________________
/**
 * Appends strings to the list used to decide if a location must be ignored.
 * <P>
 * The given string is split into tokens by a LineTokenizer and each token
 * is added to the list. The list is created if it does not exist yet;
 * strings already in the list are kept.
 * If the given string is null or empty nothing is added : the only
 * effect is the creation of an empty list when there is none.
 * @param list the list of space separated tokens.
 */
final private void loadIgnoredList(String list)
  {
    final boolean nothingToLoad = (list == null) || (list.length() < 1);

    if(nothingToLoad) {
        if(ignored == null) {
            ignored = new MiniSet();
        }
        return;
    }

    StringTokenizer tokens = new LineTokenizer(list);

    // First load : size the set according to the number of tokens.
    if(ignored == null) {
        ignored = new MiniSet(tokens.countTokens());
    }

    while(tokens.hasMoreTokens()) {
        ignored.add(tokens.nextToken());
    }
  }
//______________________________________________________________________________
/**
 * Checks to see if the locator is served by this site or not.
 * <br>
 * The server specification of the locator is compared with the one of
 * the top node.
 *
 * @param locator the locator to check.
 * @return true if it is provided by the server of the top node, false
 * if it is not or if there is no top node yet.
 * @see Locator#getServerSpec()
 */
private final boolean isServed(Locator locator)
  {
      if(top != null) {
          String mine = top.getServerSpec();
          return mine.equals(locator.getServerSpec());
      }

      return false;
  }
//______________________________________________________________________________
/**
 * Returns the url of the directory of the top node.
 * <br>
 * The url is built the first time it is requested, by appending the
 * parent path of the top locator to its scheme and server specification;
 * the result is kept for the next calls.
 * @return the url.
 */
private final String getTopDirURL()
    {
        if(top_dir_url == null) {
            Locator topLocator = top.getLocator();
            top_dir_url = topLocator.getSchemeServerSpec()
                        + topLocator.getParentPath();
        }

        return top_dir_url;
    }
//______________________________________________________________________________
}

