#!/usr/local/bin/python

# CollectingParser:
# Defines two interrelated classes which parse an HTML document and extract
# relevant information about it.
#
# This code copyright 1996 by Tessa Lau, Seattle, WA.
# tlau@cs.washington.edu
#
# This is free software; unrestricted redistribution is allowed under the
# terms of the GPL.  For full details of the license conditions of this
# software, see the GNU General Public License.
#
# HTMLDocument:
# Stores information about a document.
# Methods:
# get_url()
#     returns the base url of this document, if a <base href="..."> tag
#     was present in the document; the empty string, otherwise.
# get_title()
#     returns the document title
# get_words()
#     Returns the non-tagged words in the document, including anchor text.
#     This list is lowercased and filtered through a set of stop
#     words---though this behavior is easy to change.
# get_uniq_words()
#     Like get_words(), but returns only the unique words in the document.
# get_images()
#     Returns a list of images; each element in the list is the url to the
#     image.
# get_anchors()
#     Returns a list of (url, link-text) pairs, one for each link in the
#     document.
# get_images_abs()
# get_anchors_abs()
#     Returns the same as the above, only uses the base url to make all
#     links absolute.
# get_pagesize()
#     Returns the page size in bytes.
# get_tables()
# get_frames()
# get_java()
# get_isindex()
# get_forms()
#     Returns the number of times each tag appeared in the doc.
# print_attrs()
#     Print all the data we have about this doc to stdout.
#
# CollectingParser:
# Does the actual parsing of HTML.
# Methods:  (see the sgmllib.SGMLParser interface for more details)
# CollectingParser()
#     Create an instance of a CollectingParser.
# feed_document(data, url=None)
#     If data is an HTML document, feed_document() causes the parser to
#     parse the data and return the HTMLDocument object which describes
#     that document.  If url is given, the base url of the HTMLDocument
#     object will be set to this.  If the base tag is encountered in the
#     HTML text, it will override the url passed to this function.
#
# See the test function below for an example on how to use these classes.

from sgmllib import SGMLParser
import sys, string, regsub, regex
import urlparse

# My collection of stopwords---please replace it
from Stopwords import stopwords

def uniques(list):
	"""Return the distinct elements of list.

	The elements are used as dictionary keys, so they must be hashable.
	The result is whatever the dictionary's keys() yields; no particular
	ordering is arranged here.
	"""
	seen = {}
	for item in list:
		# Only the keys matter; the value is a throwaway placeholder
		seen[item] = None
	distinct = seen.keys()
	return distinct

class HTMLDocument:
	def __init__(self):
		self.url = ""
		self.title = ""
		self.tables = 0
		self.frames = 0
		self.java = 0
		self.isindex = 0
		self.forms = 0
		self.size = 0
		self.fgcolor = ""
		self.linkcolor = ""
		self.bgcolor = ""
		self.bgimage = 0
		self.images = []
		self.links = []
		self.words = []
	
	# All the words in the doc, lowercased and filtered for <= 2-letter
	# words and stopwords by the CollectingParser
	# The words argument is a list of words
	def add_words(self, words):
		self.words = self.words + words
	def get_words(self):
		return self.words
	def get_uniq_words(self):
		return uniques(self.words)

	# HTML tables
	def set_tables(self):
		self.tables = self.tables + 1
	def get_tables(self):
		return self.tables

	# HTML frames
	def set_frames(self):
		self.frames = self.frames + 1
	def get_frames(self):
		return self.frames

	# Java applets
	def set_java(self):
		self.java = self.java + 1
	def get_java(self):
		return self.java

	# The base url
	def set_url(self, newurl):
		self.url = string.strip(newurl)
	def get_url(self):
		return self.url

	# The document title
	def set_title(self, newtitle):
		# Change all whitespace to spaces
		self.title = string.join(string.split(newtitle))
	def get_title(self):
		return self.title

	# Forms and isindex tags
	def set_isindex(self):
		self.isindex = self.isindex + 1
	def get_isindex(self):
		return self.isindex
	def set_forms(self):
		self.forms = self.forms + 1
	def get_forms(self):
		return self.forms

	# Inlined images
	def set_images(self, url):
		self.images.append(url)
	def get_images(self):
		return self.images
	def get_images_abs(self):
		if self.url:
			return map(lambda x, base=self.url: urlparse.urljoin(base,
				x), self.images)
		else:
			return self.images

	# Anchors
	def set_anchors(self, url, text):
		self.links.append(url, string.join(string.split(text)))
	def get_anchors(self):
		return self.links
	def get_anchors_abs(self):
		if self.url:
			return map(lambda x, base=self.url: (urlparse.urljoin(base,
				x[0]), x[1]), self.links)
		else:
			return self.links

	# Page size
	def set_pagesize(self, size):
		self.size = size
	def get_pagesize(self):
		return self.size

	# Fore/Background color/images
	def set_fgcolor(self, color):
		r = regex.compile('#[0-9a-fA-F]*')
		i = r.match(color)
		if i != -1:
			self.fgcolor = color[:i]
	def get_fgcolor(self):
		return self.fgcolor
	def set_linkcolor(self, color):
		r = regex.compile('#[0-9a-fA-F]*')
		i = r.match(color)
		if i != -1:
			self.linkcolor = color[:i]
	def get_linkcolor(self):
		return self.linkcolor
	def set_bgcolor(self, color):
		r = regex.compile('#[0-9a-fA-F]*')
		i = r.match(color)
		if i != -1:
			self.bgcolor = color[:i]
	def get_bgcolor(self):
		return self.bgcolor
	def set_bgimage(self):
		self.bgimage = 1
	def get_bgimage(self):
		return self.bgimage

	def print_attrs(self):
		print "URL:", self.url
		print "Title:", self.title
		print "Size:", self.size
		print "Tables:", self.tables
		print "Frames:", self.frames
		print "Java:", self.java
		print "Forms:", self.forms
		print "Isindex:", self.isindex
		print "Foregrond color:", self.fgcolor
		print "Link color:", self.linkcolor
		print "Background color:", self.bgcolor
		print "Background image:", self.bgimage
		print "Images:"
		for img in self.get_images_abs():
			print '\t', img
		print "Links:"
		for (link, text) in self.get_anchors_abs():
			print '\t', link, '(' + text + ')'
		print "Words:", string.joinfields(self.get_words(), ",")

class CollectingParser(SGMLParser):
	"""SGMLParser subclass that fills in an HTMLDocument while parsing.

	Call feed_document() once per document; it returns the populated
	HTMLDocument.  The start_*/end_*/do_* methods are the per-tag
	handlers that SGMLParser looks up by name.

	State kept between handler calls:
	  doc      -- the HTMLDocument being filled in
	  text     -- all character data seen so far
	  savedata -- text of the region currently being captured (a title
	              or a link), or None when nothing is being captured
	  anchor   -- href of the link currently open, or None
	"""
	from htmlentitydefs import entitydefs

	def __init__(self):
		self.savedata = None
		self.anchor = None
		# Give doc and text usable values from the start, so document()
		# and a direct feed() work before feed_document() is called
		self.text = ""
		self.doc = HTMLDocument()
		SGMLParser.__init__(self)

	def feed_document(self, data, url=None):
		"""Parse data as HTML and return the HTMLDocument describing it.

		url, if given, becomes the document's base url; a base tag with
		an href in the data overrides it.
		"""
		# Drop parser state (buffer, open-tag stack) left over from any
		# document parsed earlier with this instance
		self.reset()
		# Initialize our state
		self.doc = HTMLDocument()
		self.doc.set_pagesize(len(data))
		# Initialize a base url; will be overridden if there's a 
		# base tag
		if url:
			self.doc.set_url(url)
		self.savedata = None
		self.anchor = None
		self.text = ""
	
		# Do the parsing and return the doc
		self.feed(data)
		self.close()
		self.doc.add_words(self.parse_words(self.text))
		return self.doc

	def handle_data(self, data):
		"""Collect character data from the document."""
		# If we're in the middle of a region being saved, save it
		if self.savedata is not None:
			self.savedata = self.savedata + data
		# Saved or not, the data always counts as document text
		self.text = self.text + data

	def parse_words(self, data):
		"""Split data into lowercased words and return the ones kept.

		A word is kept when it is 3 to 40 characters long, is not in
		stopwords, and does not consist of digits only.
		"""
		# Compile once rather than matching from source for every word
		all_digits = regex.compile("^[0-9]+$")
		outwords = []
		for word in regsub.split(string.lower(data), "[^a-zA-Z0-9_]"):
			if not (2 < len(word) <= 40):
				continue
			if word in stopwords:
				continue
			# filter out all-numeric words
			if all_digits.match(word) != -1:
				continue
			outwords.append(word)
		return outwords

	def save_bgn(self):
		"""Start capturing character data into savedata."""
		self.savedata = ''

	def save_end(self):
		"""Stop capturing and return what was captured.

		Returns None when no capture was in progress.
		"""
		data = self.savedata
		self.savedata = None
		return data

	def start_table(self, attrs):
		self.doc.set_tables()
	def end_table(self):
		pass

	def do_frame(self, attrs):
		self.doc.set_frames()

	def start_applet(self, attrs):
		self.doc.set_java()
	def end_applet(self):
		pass

	def start_title(self, attrs):
		self.save_bgn()
	def end_title(self):
		# Always close the capture, even for an empty title; otherwise
		# every later piece of text keeps being copied into savedata
		title = self.save_end()
		if title:
			self.doc.set_title(title)

	def do_isindex(self, attrs):
		self.doc.set_isindex()

	def start_form(self, attrs):
		self.doc.set_forms()
	def end_form(self):
		pass

	def start_base(self, attrs):
		# A base href replaces whatever base url was set before
		for attr, val in attrs:
			if attr == 'href':
				self.doc.set_url(val)
	def end_base(self):
		pass

	def start_img(self, attrs):
		for attr, val in attrs:
			if attr == 'src':
				self.doc.set_images(val)
	def end_img(self):
		pass

	def start_a(self, attrs):
		for attr, val in attrs:
			val = string.strip(val)
			if attr == 'href':
				self.anchor = val
		self.save_bgn()
	def end_a(self):
		# Close the capture whether or not this anchor had an href, so
		# text after e.g. <a name=...> is not copied into savedata
		text = self.save_end()
		href = self.anchor
		self.anchor = None
		if not href:
			# Yikes, must be bad html
			return
		if text is None:
			# Another end handler already closed the capture; record
			# the link with empty text rather than passing None on
			text = ""
		self.doc.set_anchors(href, text)

	def do_br(self, attrs):
		# A line break is recorded as a newline in the text
		self.handle_data("\n")

	def do_body(self, attrs):
		# Pick up the color and background attributes of the body tag
		for attr, val in attrs:
			val = string.strip(val)
			if attr == 'text':
				self.doc.set_fgcolor(val)
			if attr == 'link':
				self.doc.set_linkcolor(val)
			if attr == 'bgcolor':
				self.doc.set_bgcolor(val)
			if attr == 'background':
				self.doc.set_bgimage()

	def start_td(self, attrs):
		pass
	def end_td(self):
		# Presumably keeps words in neighboring cells from running
		# together when the text is split into words
		self.handle_data(" ")

	# Return the HTMLDocument we populated during our run
	def document(self):
		return self.doc

def test():
	if len(sys.argv) != 2:
		print "Please specify a URL on the command line."
		sys.exit()
	url = sys.argv[1]
	import urllib
	fo = urllib.urlopen(url)
	data = fo.read()
	fo.close()
	p = CollectingParser()
	d = p.feed_document(data, url)
	d.print_attrs()

# Run the command-line demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
	test()
