###############################################################################
###############################################################################
#                               HtmlParser.tcl
###############################################################################
###############################################################################
# In this file are implemented the procedures used to parse Html file links.
###############################################################################
###############################################################################
# Copyright 2000 Andrs Garca Garca  -- fandom@retemail.es
# Distributed under the terms of the GPL v2
###############################################################################
###############################################################################
namespace eval HtmlParser {

###############################################################################
# SetEntities
#    Initializes the arrays with the translation for Html entities, something
#    like 'entity(lt)==>'
###############################################################################
proc SetEntities {} {
    variable entities

    set entities(quot)      \"
    set entities(amp)       \\&
    set entities(lt)        <
    set entities(gt)        >
    set entities(nbsp)      {}
    set entities(iexcl)     
    set entities(cent)      
    set entities(pound)     
    set entities(curren)    
    set entities(yen)       
    set entities(brvbar)    \|
    set entities(sect)      
    set entities(uml)       
    set entities(copy)      
    set entities(ordf)      
    set entities(laquo)     
    set entities(not)       
    set entities(shy)       
    set entities(reg)       
    set entities(macr)      
    set entities(deg)       
    set entities(plusmn)    
    set entities(sup2)      
    set entities(sup3)      
    set entities(acute)     
    set entities(micro)     
    set entities(para)      
    set entities(middot)    
    set entities(cedil)     
    set entities(sup1)      
    set entities(ordm)      
    set entities(raquo)     
    set entities(frac14)    
    set entities(frac12)    

    set entities(frac34)    
    set entities(iquest)    
    set entities(ntilde)    
    set entities(Agrave)    
    set entities(Aacute)    
    set entities(Acirc)     
    set entities(Atilde)    
    set entities(Auml)      
    set entities(Aring)     
    set entities(AElig)     
    set entities(Ccedil)    
    set entities(Egrave)    
    set entities(Eacute)    
    set entities(Ecirc)     
    set entities(Euml)      
    set entities(Igrave)    
    set entities(Iacute)    
    set entities(Icirc)     
    set entities(Iuml)      
    set entities(ETH)       
    set entities(Ntilde)    
    set entities(Ograve)    
    set entities(Oacute)    
    set entities(Ocirc)     
    set entities(Otilde)    
    set entities(Ouml)      
    set entities(times)     
    set entities(Oslash)    
    set entities(Ugrave)    
    set entities(Uacute)    
    set entities(Ucirc)     
    set entities(Uuml)      
    set entities(Yacute)    
    set entities(THORN)     
    set entities(szlig)     
    set entities(agrave)    
    set entities(aacute)    
    set entities(acirc)     
    set entities(atilde)    
    set entities(auml)      
    set entities(aring)     
    set entities(aelig)     
    set entities(ccedil)    
    set entities(egrave)    
    set entities(eacute)    
    set entities(ecirc)     
    set entities(euml)      
    set entities(igrave)    
    set entities(iacute)    
    set entities(icirc)     
    set entities(iuml)      
    set entities(eth)       
    set entities(ntilde)    
    set entities(ograve)    
    set entities(oacute)    
    set entities(ocirc)     
    set entities(otilde)    
    set entities(ouml)      
    set entities(divide)    
    set entities(oslash)    
    set entities(ugrave)    
    set entities(uacute)    
    set entities(ucirc)     
    set entities(uuml)      
    set entities(yacute)    
    set entities(thorn)     
    set entities(yuml)      

    return
}

###############################################################################
# ShowLinks
#    Show the links found in the last preprocessed page, it's only good for
#    debugging.
###############################################################################
proc ShowLinks {} {
    variable nLinks
    variable links

    for {set i 1} {$i<$nLinks} {incr i} {
        set description [TidyDescription $links($i,descrip) $links($i,url)]
        if {[info exists links($i,type)]} {
            puts "\n$i: $links($i,file) - Bajar: $links($i,ok)"
            puts "$description - $links($i,type)"
            puts "$links($i,url)"
        } else {
            puts "\n$i: $links($i,file) - Bajar: $links($i,ok)"
            puts "$description"
            puts "$links($i,url)"
        }
    }
    return
}

###############################################################################
# ParseUrl
#    Given an url 'ParseUrl' will split it in its parts: protocol, domain,
#    directory and filename
#
# Parameter
#    The url to be parsed,
#
# Returns
#    A list with the url split as mentioned above or '1' if the url couldn't be
#    parsed.
###############################################################################
proc ParseUrl {url} {

    if {[regexp -nocase \
            {(([^:]*)(?:://))?([^/]+)(((?:~[^/]*)?(?:[^\?]*))(?:/)([^#]*))?} \
            $url nada nada protocol domain nada dir fileName]} {
        return [list $protocol $domain $dir $fileName]
    }

    return 1
}

###############################################################################
# TidyDir
#    Takes things like ".." and "." from the absolute path.
#
# Parameter:
#    File path.
#
# Returns:
#    The tidied file path.
###############################################################################
proc TidyDir {path} {
    if {[regexp {\.$} $path]} {
        append path /
    }
    for {set a 1 ; set b 1} {($a>0)||($b>0)} {} {
        set a [regsub -all {/\./} $path {/} path]
        set b [regsub -all {([^./]+/\.\./)} $path {} path]
    }
    for {} {[regsub {^/\.\.} $path {} path]} {} {}

    return $path
}

###############################################################################
# RemoveEntities
#    Given a link or a link description, this procecedure subtitutes the
#    Html character entities for the real thing, for example '&amp;' gets
#    changed to '&'.
#
# Parameter
#    The string to process.
#
# Returns
#    The string processed.
##############################################################################
proc RemoveEntities {string} {
    variable entities

    while {[regexp {(?:&)([^ ;]+)(;)?} $string old entity]} {
        regsub {#} $entity {} entity
        # Eventually this should be replaced with "string is number"
        if {[regexp {^[0-9]+$} $entity]} {
            if {[catch {format %c $entity} new]} {
                break
            }
            regsub -all {([\\])} $new {\\\1} new
        } else {
            if {[catch {set ::HtmlParser::entities($entity)} new]} {
                break
            }
        }
        if {$new=="&"} {
            set new "\\&"
        }
        regsub -all $old $string $new string
    }
    return $string
}

###############################################################################
# TidyLinks
#     Removes Html character entities from the links. It seems that if a
#     file is, for example, called 'me&you.jpg' some webmasters or Html editors
#     will put 'me&amp;you.jpg' in the link.
#
# Side efects
#     The links in 'links' will contain no character entities
################################################################################
proc TidyLinks {} {
    variable nLinks
    variable links

    for {set i 1} {$i<$nLinks} {incr i} {
        set links($i,file) [RemoveEntities $links($i,file)]
    }
    return
}

###############################################################################
# TidyDescription
#    Translates for human eyes the description of the links.
#
# Parameters:
#    description: The description to be translated.
#    url: The url for the translation
#
# Returns:
#    The description translated.
###############################################################################
proc TidyDescription {description url} {
    global labelDialogs
    variable entities
    variable nLinks
    variable links

    if {[regexp {^[\s]*$} $description]} {
        return $url
    }
    if {[regexp -nocase {(<img)} $description]} {
        regsub -all {<.*?>} $description {} tmp
	    if {![regexp {^\s*$} $tmp]} {
            set description $tmp
        } else {
            set description [GetFileName $description alt]
            if {$description==1} {
                set description $url
            }
            set description "$labelDialogs(linkImage): $description"
        }
    }
    regsub -all {<.*?>} $description {} description

    set description [RemoveEntities $description]
    regsub -all {\s+} $description { } description
    regsub      {^\s} $description {}  description

    if {$description==""} {
        return $url
    }
    return $description
}

###############################################################################
# ChangeEncoding
#    Changes the encoding in which the description of the links are written
#
# Parameters:
#    newEncoding: The encoding to use.
###############################################################################
proc ChangeEncoding {newEncoding} {
    variable nLinks
    variable links
    global dirGetleft

    for {set i 1} {$i<$nLinks} {incr i} {
        set links($i,descrip) \
                [encoding convertfrom "$newEncoding" $links($i,descrip)]
    }

    return
}

###############################################################################
# GetFileName
#    Extrac the filename of the link, or the description in the 'alt' field
#    from whatever it gets send.
#
# Parameter
#    tag: the string to process.
#    type: what we are looking for: "href", "src" or "alt".
#
# Returns:
#    The filename or '1' if none was found.
###############################################################################
proc GetFileName {tag type} {

    regsub {\s*=\s*} $tag {=} tag
    if {[regexp -nocase -expanded [subst -nocommand {
            (?:$type=) 
            (?:(?:(?:\")([^\"]+))| # Filename between ""
            (?:(?:\')([^\']+))|    # Between ''
            ([^\ \"']+))           # No delimiter
    }] $tag nada a b c]} {
        if {$a!=""} {
            set fileName $a
        } elseif {$b!=""} {
            set fileName $b
        } elseif {$c!=""} {
            set fileName $c
        }
        # It seems you can write a link as //site.com/file.html
        regsub {^//} $fileName {http://} fileName
        # Workaround for Webmasters that don't know the directory separator 
        # is a single / or feel like using the windows one \.
        regsub -all {([^:]//)|(\\)} $fileName {/} fileName
        # The following is due to Javascript variables.
        if {[regexp {^(&)(.*)(;)$} $fileName]} {
            return 1
        }
        if {[regexp -nocase {^javascript:} $fileName]} {
            return 1
        }
        return $fileName
    }
    if {[regexp -nocase [subst -nocommand {(?:$type=)((\"\")|(\'\'))}] $tag]} {
        return ""
    }
    return 1
}

###############################################################################
# CompleteString
#    Reads from the channel 'leer' until the 'cosa' string includes the
#    substring passed as a parameter.
#
# Parameters:
#    cadena: name of the variable with the string to complete.
#    leer: channel to read from.
#    pattern: substring to look for in the channel.
#
# Returns:
#    - '0': No errors.
#    - '1': The string could not be completed.
#
# Side efects:
#    'cosa' is completed.
###############################################################################
proc CompleteString {cadena leer pattern} {

    upvar $cadena cosa
    for {set tmp [gets $leer]} {![eof $leer]} {set tmp [gets $leer]} {
        append cosa " " $tmp
        if {[regexp -nocase "$pattern" $tmp]} {
            return 0
        }
    }
    return 1
}

###############################################################################
# Parsing
#    Reads the Web page passed as a parameter and proccess it to extract
#    all links.
#
# Parameters:
#    file: File which contains the page to process.
#    referer: The referer link for the page.
#
# Returns:
#    - '0': No errors.
#    - '1': Couldn't open file.
#
# Side efects:
#    'nLinks': number of links plus one.
#    'links' : keeps all the info about the links.
###############################################################################
proc Parsing {file referer} {
    global   labelDialogs labelTitles
    variable nLinks
    variable links
    variable pageEncoding
    variable baseTag

    if {[string match $file ""]} return

    set nLinks  1
    catch {unset links}

    if {[catch {open $file r} leer]} {
        return 1
    }
    catch {unset linkType}
    set pageEncoding ""

    for {set cosa [gets $leer] ; set thumbnailNext 0 ; set newBase "" ; set baseTag ""} \
            {(![eof $leer]) || ([string compare $cosa ""])} \
            {if {![eof $leer]} {append cosa " " [gets $leer] " "} } {
        if {![regexp {(<)(.*)} $cosa cosa]} {
            set cosa ""
            continue
        }
        if {![regexp -nocase {(<)(a|b|l|i|f|s|m)(.*)} $cosa cosa]} {
            set cosa ""
            continue
        }
        if {![regexp {>} $cosa]} {
            set result [CompleteString cosa $leer ">"]
            if {$result==1} break
        }
        regexp {(?:<)([^>]*)(?:>)(.*)} $cosa nada tag cosa
        set lowerTag [string tolower $tag]

        if {![regexp {^(a.+h|link|img|frame|area|form|script|meta|base)} $lowerTag ]} {
            continue
        }
        set fileName 1
        switch -regexp $lowerTag {
            ^meta {
                regexp {(?:meta *)(?:charset=)(?:\"|')?([^\"' ]*)} $tag \
                        nada pageEncoding]
            }
            ^script {
                if {![regexp -nocase {/script>} $cosa]} {
                    set result [CompleteString cosa $leer "/script>"]
                    if {$result==1} break
                }
                regexp -nocase {(?:/script>)(.*)} $cosa nada cosa
                if {[set fileName [GetFileName $tag src]]!=1} {
                    set links($nLinks,descrip) "Script: $fileName"
                }
            }
            "^area" {
                if {[set fileName [GetFileName $tag href]]!=1} {
                    if {[set descrip [GetFileName $tag alt]]!=1} {
                        set links($nLinks,descrip) $descrip
                    } else {
                        set links($nLinks,descrip) "$labelDialogs(map)"
                    }
                }
            }
            "^a.+href" {
                if {[set fileName [GetFileName $tag href]]!=1} {
                    if {[regexp -nocase {^mailto:|^news:} $fileName]} continue
                    if {[regexp -nocase {(javascript)(.*)(\()} $fileName]} continue
                    if {![regexp -nocase {(</a)} $cosa]} {
                        set result [CompleteString cosa $leer "</a"]
                        if {$result==1} break
                    }
                    regexp -nocase {(.*?)(?:</a)} $cosa nada descripcion
                    set links($nLinks,descrip) $descripcion
                    if {[string match $descripcion ""]} {
                        set fileName 1
                    }
                    if {[regexp -nocase {<img[^>]* src} $descripcion]} {
                        set thumbnailNext 1
                    }
                }
            }
            "^link +href" {
                if {[set fileName [GetFileName $tag href]]!=1} {
                    set links($nLinks,descrip) "$labelDialogs(css)"
                }
            }
            "^frame" {
                if {[set fileName [GetFileName $tag src]]!=1} {
                    set links($nLinks,descrip) "$labelDialogs(frame): $fileName"
                }
            }
            "^img" {
                if {[set fileName [GetFileName $tag src]]!=1} {
                    if {[set descrip [GetFileName $tag alt]]==1} {
                        set descrip $fileName
                    }
                    set links($nLinks,descrip) "$labelDialogs(image): $descrip"
                    if {$thumbnailNext==0} {
                        set links($nLinks,type) image
                    } else {
                        set links($nLinks,type) thumb
                        set thumbnailNext 0
                    }
                }
            }
            "^base" {
                if {[set fileName [GetFileName $tag href]]!=1} {
		            set baseTag $tag
                    set newBase [CompleteUrl $fileName $referer ""]
                    set fileName 1
                }
            }
        }
        if {$fileName!=1} {
            set newName $fileName
            if {![regexp {^#} $fileName]} {
                set links($nLinks,file) $fileName
                set links($nLinks,url)  [CompleteUrl $fileName $referer $newBase]
                incr nLinks
            }
        }
    }
    close $leer
 
    return 0
}

###############################################################################
# FilterLinks
#    Filters the links extracted from a page according to the rules given.
#
# Parameters
#   referer: The url of the page we got the links from.
#   linkArray: Name of the array where the links are stored
#
# Side effects:
#    'nLinks' and 'links' are upated to the new, filtered links
###############################################################################
proc FilterLinks {referer linkArray} {
    global downOptions filesDone siteUrl directories

    upvar #0 $linkArray links

    set baseSite $siteUrl(www)
	regexp {(.*)(:)} $siteUrl(www)  nada baseSite

    for {set i 1} {[info exists links($i,url)]} {incr i} {
        set link $links($i,url)
		set links($i,ok) 1

        if {([regexp {\.ram$} $link])} {
            set links($i,ok) 0
            continue
        }
        if {($downOptions(filter)!="")&&([regexp -nocase "$downOptions(filter)" \
                $links($i,file)])} {
            set links($i,ok) 0
            continue
        }
        if {([regexp {\?} $link])&&($downOptions(cgi)==0)} {
            set links($i,ok) 0
            continue
        }
        if {[info exists filesDone([::Commands::TidyNames \
                [TidyDir [file join $directories(base)$siteUrl(dir) $link]]])]} {
            set links($i,ok) 0
            continue
        }
        if {[regexp {^ftp:.*/$} $link]} {
            set links($i,ok) 0
            continue
        }
        if {[regexp -nocase {^https://} $link]} {
            set links($i,ok) 0
            continue
        }
        set parsedUrl     [ParseUrl $link]
        set protocol      [lindex $parsedUrl 0]
        set direccion_www [lindex $parsedUrl 1]
        set directory     [lindex $parsedUrl 2]
        if {![info exists direccion_www]} {
            set links($i,ok) 0 ; # Maybe something should be put in the error \
                                   log about this.
            continue
        }
        # www.domain.com and www.domain.com:8080 will be considered the
        # same site.
		set linkSite $direccion_www
		regexp {(.*)(:)} $direccion_www nada linkSite
        if {[string compare [string tolower $baseSite] \
                    [string tolower $linkSite]]} {
            if {$downOptions(external)==0} {
	           set links($i,ok) 0
                continue
            }
            set externalLink 1
        } else {
            set externalLink 0
        }
        if {($downOptions(dir)==0)&&($siteUrl(base)!="")&&($externalLink==0)} {
            if {![regexp -nocase "^$siteUrl(base)" $directory]} {
				set links($i,ok) 0
                continue
            }
        }
        set remove 0

        catch {
            if {($links([expr {$i+1}],type)=="thumb")&&($downOptions(images)==1)} {
                if {[regexp -nocase {(gif$)|(jpg$)|(bmp$)|(xbm$)|(tiff$)|(png$)}\
						$links($i,file)]} {
                    set remove 1
                }
            }

        }
        catch {
            if {($links($i,type)=="thumb")&&($downOptions(images)==2)} {
                set remove 1
            }
        }
        if {$remove==1} {
            set links($i,ok) 0
            continue
        }
    }
    return
}

###############################################################################
# CompleteUrl
#    Given a link, this procedure returns the full Url of that link, for
#    example, a link from a page may be '../index.html', this procedure
#    will return something like 'http://www.algo.es/cosas/index.html'
#
# Parameter
#    link: I'll let you guess
#    referer: url of the referrer page for the link
#    newBase: In case the page contains a 'BASE' tag, this will have the
#             url to use as base for the links.
#
# Returns
#    The url
###############################################################################
proc CompleteUrl {link referer newBase} {
    global siteUrl

    set link [RemoveEntities $link]
    if {[regexp {://} $link]} {
        # cgi links may have a http:// and still be relative
        if {![regexp {(\?)(.*)(://)} $link]} {
            return $link
        }
    }

    if {$newBase==""} {
        set parsedUrl [ParseUrl $referer]
    } else {
        set parsedUrl [ParseUrl $newBase]
    }	    
    set prot   [lindex $parsedUrl 0]
    set domain [lindex $parsedUrl 1]
    set dir    [lindex $parsedUrl 2]

    if {[regexp {(?::/)([^/].*)} $link nada fileName]} {
        set url $prot://$domain/$fileName
        return $url
    }
    if {[regexp {^/} $link]} {
        set url $prot://$domain$link
        return $url
    }
    set fileName [TidyDir $dir/$link]
    set url "$prot://$domain$fileName"
    return $url
}

SetEntities

}
