#!/usr/bin/bash
#
# -----------------------------------------------------------------------------
#
# A license is hereby granted to reproduce this software source code and
# to create executable versions from this source code for personal,
# non-commercial use.  The copyright notice included with the software
# must be maintained in all copies produced.
#
# THIS PROGRAM IS PROVIDED "AS IS". THE AUTHOR PROVIDES NO WARRANTIES
# WHATSOEVER, EXPRESSED OR IMPLIED, INCLUDING WARRANTIES OF
# MERCHANTABILITY, TITLE, OR FITNESS FOR ANY PARTICULAR PURPOSE.  THE
# AUTHOR DOES NOT WARRANT THAT USE OF THIS PROGRAM DOES NOT INFRINGE THE
# INTELLECTUAL PROPERTY RIGHTS OF ANY THIRD PARTY IN ANY COUNTRY.
#
# Copyright (c) 1995, 1996, 1997, 1998 John Conover, All Rights Reserved.
#
# Comments and/or bug reports should be addressed to:
#
#     john@johncon.com (John Conover)
#
# -----------------------------------------------------------------------------
#
# wgetrel search Internet Web pages for documents that are relevant to a
# search criteria
#
# wgetrel [-l N] [-n N] criteria <htmlfile | url1 url2 ...>
#
#     -l N, search to a depth of N many links.
#     -n N, output a maximum of N many http descriptors.
#
# Wgetrel is an Internet Web page search engine, using the programs
# htmlrel(1) and wget(1). (The program htmlrel(1) is available via
# anonymous ftp from
# ftp://sunsite.unc.edu/pub/Linux/utils/text/rel*.tar.gz, and the
# program wget(1) is available via anonymous ftp from
# ftp://prep.ai.mit.edu/pub/gnu/wget.tar.gz.) The direction of the
# search is controlled through determination of relevance of the
# documents to a search criteria.
#
# See the man page for htmlrel(1) for additional information on the
# syntax of the criteria argument-the boolean operators supported are
# logical or, logical and, and logical not.  These operators are
# represented by the symbols, "|", "&", and, "!", respectively, and left
# and right parenthesis, "(" and ")", are used as the grouping
# operators.
#
# The documents searched are stored in a directory, www, residing in the
# directory where wgetrel(1) was invoked. The output of the wgetrel(1)
# script is a file, www.html, in the directory where the wgetrel(1)
# script was invoked. The structure and syntax of the file is compatible
# with the Netscape level 1 bookmark file specification-it can be
# browsed with Netscape, Mosaic, or Lynx, for example:
#
#     lynx www.html
#     Mosaic www.html
#     Netscape www.html
#
# EXAMPLE USAGE
#
#     wgetrel mycriteria http://my.favorite.url
#     wgetrel mycriteria www.html
#         .
#         .
#         .
#     wgetrel mycriteria www.html
#     netscape www.html
#
# SEARCH STRATEGIES
#
# The shell script wgetrel(1) uses the wget(1) program in conjunction
# with htmlrel(1) to provide a flexible and extensible Internet HTML Web
# page search tool. The script may be altered to optimize search
# strategies. One of the advantages of relevance searching on the
# Internet is that the search of HTML links can be controlled by the
# relevance of information contained in the HTML pages. This can be an
# iterative process, for example:
#
#     wget -nc -r -l1 -H -x -t3 -T15 -P www -Ahtml,htm http://my.favorite.url
#
# would "seed" the html page directory, www, with pages from the URL,
# http://my.favorite.url. Note that a search "context" has already been
# specified; doing a search, for example on game theory, by specifying a
# keyword of "game" to an Internet search engine would not produce the
# desired results. However, if the URL, http://my.favorite.url, was the
# Web pages for an economics department at a university, the "context"
# would be entirely different.
#
# The next iteration of the search, going down another level in the
# hierarchy of the links might be:
#
#     cd www
#     htmlrel criteria * > ../www.html
#     cd ..
#
# and the search iterated:
#
#     wget -nc -r -l1 -H -x -t3 -T15 -P www -Ahtml,htm -i www.html
#
# where the file www.html is a list of the URL's containing information,
# in order of relevance, as specified in the criteria arguments to
# htmlrel(1).  Since the URL's are ordered by relevance, the most
# "promising," (ie., the documents with the best probability of
# containing the information that is being searched for,) the file,
# www.html, can be trimmed, say, to 10, URL's:
#
#     cd www
#     htmlrel -n 10 criteria * > ../www.html
#     cd ..
#
# and the search iterated:
#
#     wget -nc -r -l1 -H -x -t3 -T15 -P www -Ahtml,htm -i
#     www.html
#
# which would descend the search another level in the link hierarchy
# from http://my.favorite.url.
#
# Alternatively, the file, www.html, can be edited, and reordered, (in
# an interactive fashion with each search,) with any popular browser to
# enhance the search direction and capability. Note that the search
# criteria can be altered in the process, and, since the Web pages are
# stored on the local machine, can be viewed, "off line."  Note, also,
# that the programs wget(1) and htmlrel(1) are "portable," so the actual
# search can use a host that has a direct high speed connection to the
# Internet-and the file, www.html, transferred back to the local machine.
#
# One of the issues in searching the Internet, is that the number of
# HTTP links that need to be searched increases exponentially with the
# number of HTTP pages that have already been searched-if the number of
# pages in the directory, www, are increasing exponentially, it is
# probably appropriate to constrain the search through alteration of the
# search criteria used for htmlrel(1). (There are about three links, on
# average, on every HTML page.)
#
# For exhaustive searches, the depth, (the -l argument to both wget(1)
# and htmlrel(1),) can be increased. For general searching, a depth of 3
# will usually suffice, and only one iteration will be
# required. Typically, this will reduce the search time for specific
# information by approximately an order of magnitude.
#
# The programs (NOTE: this script uses bash(1) syntax-bash must be used):
#
CD="cd" # change directory, normally a shell builtin
ECHO="echo" # print a line, normally a shell builtin
MKDIR="mkdir" # create directories, probably executed in the shell
HTMLREL="/usr/local/bin/htmlrel" # relevance ranker; anonymous ftp to sunsite.unc.edu in /pub/Linux/utils/text
WGET="/usr/local/bin/wget" # page fetcher; anonymous ftp to prep.ai.mit.edu in /pub/gnu
#
# Where fetched html pages accumulate:
#
WWW="www"
#
# Script exit status, optimistically initialized to success:
#
retval="0"
#
# Link levels to descend when no -l option is given:
#
DEPTH="2"
#
# Cap on URL's written to the output file when no -n option is given,
# (which must be a large integer -- here 2^31 - 1):
#
NUMBER="2147483647"
#
#
# Fetch pages with wget(1), rank them with htmlrel(1), and write the
# ranked bookmark file ../${WWW}.html; sets retval to "0" on success,
# "1" on failure, printing a diagnostic to stderr.
#
# Arguments: "$@" -- passed straight through to wget(1); either
#            "-i file" (a file listing URL's) or explicit URL's.
#
fetch_and_rank ()
{
    retval="1" # assume error until every step succeeds
    if "${WGET}" -nc -r -l"${DEPTH}" -H -x -t3 -T15 -P "${WWW}" -Ahtml,htm "$@" # get the URL's html page(s)
    then
        if "${CD}" "${WWW}" # change to the ${WWW} directory
        then
            if "${HTMLREL}" -n "${NUMBER}" "${CRITERIA}" * > ../"${WWW}.html" # rank the pages by relevance
            then
                if "${CD}" "../" # change back to the invocation directory
                then
                    retval="0" # every step succeeded
                else
                    "${ECHO}" "Error changing to directory ../" 1>&2
                fi
            else
                "${ECHO}" "Error getting relevance of html page(s)" 1>&2
            fi
        else
            "${ECHO}" "Error changing to directory ${WWW}" 1>&2
        fi
    else
        "${ECHO}" "Error getting html page(s)" 1>&2
    fi
}
#
if "${MKDIR}" -p "${WWW}" # make sure the directory where the html pages are stored exists
then
    while [ "$#" -gt "0" ] # check for command line options
    do
        case "${1}" in
            -l) shift # request for search depth?
                if [ "$#" -gt "0" ] # depth specified?
                then
                    DEPTH="${1}"
                    shift
                else
                    "${ECHO}" "No depth specified." 1>&2
                    exit 1
                fi;;
            -n) shift # request for number of URL's in output file?
                if [ "$#" -gt "0" ] # number specified?
                then
                    NUMBER="${1}"
                    shift
                else
                    "${ECHO}" "No number specified." 1>&2
                    exit 1
                fi;;
            *) break;; # first non-option argument is the criteria
        esac
    done
    if [ "${retval}" = "0" ] # no errors?
    then
        retval="1" # assume error
        if [ "${#}" -gt "1" ] # enough arguments?
        then
            CRITERIA="${1}" # save the relevance criteria
            shift
            if [ -f "${1}" ] # first argument a file name?
            then
                fetch_and_rank -i "${1}" # URL's listed in the file's html page(s)
            else
                fetch_and_rank "${@}" # URL's given on the command line
            fi
        else
            "${ECHO}" "Usage: $0 [-l n] [-n n] criteria <htmlfile | url1 url2 ...>" 1>&2
        fi
    fi
else
    "${ECHO}" "Error creating directory ${WWW}" 1>&2
    retval="1" # creating the page directory failed
fi
exit "${retval}" # propagate success/failure to the caller
