## HTML parser: searches tag attributes for URLs

from string   import index, index_error
from sgmllib  import SGMLParser

from utils    import *
from urlparse import urlparse

class HIParser(SGMLParser):
    """SGML/HTML parser that reports URL-bearing attributes to a collaborator.

    Each tag handler below looks up one or more attributes on the tag and,
    when a non-empty value is found, forwards it to ``globale.handle_url``.
    A ``<BASE href=...>`` found inside ``<HEAD>`` is forwarded to
    ``globale.handle_base`` instead.

    The class also keeps a small ordered "todo" list of paths
    (``anchorList``) together with a cursor (``nbDone``).

    Handler naming follows the ``sgmllib`` convention: ``start_<tag>`` /
    ``end_<tag>`` for container tags and ``do_<tag>`` for empty tags.
    """

    def __init__(self, globale):
        """Create a parser bound to *globale*.

        *globale* must provide ``handle_url(value, param)`` and
        ``handle_base(value)``; those are the only members used here.
        """
        SGMLParser.__init__(self)
        self.anchorList = []    # paths queued by append_todo(), insertion order
        self.nbDone = 0         # index in anchorList of the current todo entry
        self.head = 0           # 1 while between <HEAD> and </HEAD>, else 0
        self.globale = globale

        self.i = 0              # offset given to the last parse_starttag() call
        self.l = 0              # not used anywhere in this class

    def parse_starttag(self, i):
        """Remember the offset *i* of the start tag, then defer to the base class."""
        self.i = i
        return SGMLParser.parse_starttag(self, i)

    def search_url_param(self, attrs, param):
        """Forward the value of attribute *param* to ``globale.handle_url``.

        Nothing is forwarded when the attribute is absent or its value is
        falsy (for instance an empty string).
        """
        value = self.search_param(attrs, param)
        if value:
            self.globale.handle_url(value, param)

    def search_param(self, attrs, param):
        """Return the value of the first attribute named *param*, or None.

        *attrs* is an iterable of ``(name, value)`` pairs.
        """
        for attrname, value in attrs:
            if attrname == param:
                return value
        return None

    # --- <A>
    def start_a(self, attrs):
        """Report the ``href`` of an ``<A>`` tag."""
        self.search_url_param(attrs, 'href')

    # --- <LINK>
    def do_link(self, attrs):
        """Report the ``href`` of a ``<LINK>`` tag."""
        self.search_url_param(attrs, 'href')

    # --- <AREA>
    def do_area(self, attrs):
        """Report the ``href`` of an ``<AREA>`` tag."""
        self.search_url_param(attrs, 'href')

    # --- <IMG>
    def do_img(self, attrs):
        """Report the ``src`` and ``longdesc`` of an ``<IMG>`` tag."""
        self.search_url_param(attrs, 'src')
        self.search_url_param(attrs, 'longdesc')

    # --- <BODY>
    def start_body(self, attrs):
        """Report the ``background`` of a ``<BODY>`` tag."""
        self.search_url_param(attrs, 'background')

    # --- <INPUT>
    def do_input(self, attrs):
        """Report the ``src`` of an ``<INPUT>`` tag."""
        self.search_url_param(attrs, 'src')

    # --- <SCRIPT>
    def start_script(self, attrs):
        """Report the ``src`` of a ``<SCRIPT>`` tag."""
        self.search_url_param(attrs, 'src')

    # --- <FRAME>
    def do_frame(self, attrs):
        """Report the ``src`` and ``longdesc`` of a ``<FRAME>`` tag."""
        self.search_url_param(attrs, 'src')
        self.search_url_param(attrs, 'longdesc')

    # --- <IFRAME>
    # This handler used to be named start_frame, which meant <IFRAME> was
    # never handled and, because sgmllib looks for start_<tag> before
    # do_<tag>, it also shadowed do_frame for <FRAME> tags.
    def start_iframe(self, attrs):
        """Report the ``src`` and ``longdesc`` of an ``<IFRAME>`` tag."""
        self.search_url_param(attrs, 'src')
        self.search_url_param(attrs, 'longdesc')

    # --- <HEAD>
    # we set a flag : to see if an eventual <BASE> tag is really in HEAD
    # (some kind of paranoia)
    def start_head(self, attrs):
        """Mark that the parser is now inside ``<HEAD>``."""
        self.head = 1

    def end_head(self):
        """Mark that the parser has left ``<HEAD>``."""
        self.head = 0

    # --- <BASE>
    def do_base(self, attrs):
        """Forward the ``href`` of a ``<BASE>`` tag to ``globale.handle_base``.

        The tag is ignored when it appears outside ``<HEAD>`` or when it has
        no (or an empty) ``href``; both are treated as bad HTML.
        """
        if not self.head:
            return      # BASE outside HEAD (some bad HTML !)
        value = self.search_param(attrs, 'href')
        if not value:
            return      # more bad HTML
        self.globale.handle_base(value)

    ## List functions
    def increment_done(self):
        """Advance the todo cursor by one entry."""
        self.nbDone = self.nbDone + 1

    def current_todo(self):
        """Return the entry under the todo cursor, or None when exhausted.

        ``>=`` (rather than ``==``) keeps this safe if the cursor has been
        incremented past the end of the list.
        """
        if self.nbDone >= len(self.anchorList):
            return None
        return self.anchorList[self.nbDone]

    def modify_current_todo(self, newAnchor):
        """Replace the entry under the todo cursor with *newAnchor*.

        Raises IndexError when the cursor is past the end of the list.
        """
        self.anchorList[self.nbDone] = newAnchor

    def append_todo(self, url):
        """Queue the path derived from *url* unless it is already listed.

        The path is computed by ``path_from_url`` (not defined in this
        class; presumably provided by the ``utils`` star-import -- confirm).
        Entries already processed stay in the list, so they count as
        duplicates too.

        Returns the string ``'already in list'`` or ``'new anchor (+1)'``.
        """
        fp = path_from_url(url)
        # check if not already done or in todo
        if fp in self.anchorList:
            return 'already in list'
        self.anchorList.append(fp)
        return 'new anchor (+1)'
