# Scraper for Koninklijke Bibliotheek [NL]
#   National library of the Netherlands
#   this is the nearest you will get to a reference library
#   of Dutch language works
#
# Correct as of August 2005
#

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import convertAuthor
from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    scrapers.scrapers import stripHTML
from    scrapers.scrapers import stripNewLines

# remove any <a> tags, keeping everything else
# assume PRE<a href="xxx">yyy</a>POST
# This returns PREyyyPOST
def _findFirst(text, needles, start):
    """Return the lowest index >= start where any of needles occurs, or -1."""
    best = -1
    for needle in needles:
        pos = text.find(needle, start)
        if pos != -1 and (best == -1 or pos < best):
            best = pos
    return best


def removeAtags(str):
    """Strip <a ...> ... </a> markup from a string, keeping the link text.

    Given ``PRE<a href="xxx">yyy</a>POST`` this returns ``PREyyyPOST``.
    Upper- and lower-case tags are both recognised, and whichever comes
    first in the text is handled first, so mixed-case pages are fully
    stripped.

    str -- the text to clean (the name is kept for compatibility with
           existing callers, although it shadows the builtin).

    Returns the cleaned text, or None when None is passed in.

    Malformed input:
      * an opening tag with no closing '>' is treated as truncated and
        it, plus whatever follows, is dropped;
      * an anchor that is never closed keeps its text -- only the
        opening tag itself is removed.
    """
    if str is None:
        return None

    text = str
    kept = []       # pieces of text that survive, joined at the end
    pos = 0         # start of the not-yet-examined remainder

    # "while 1" rather than "while True": keeps the script usable on very
    # old Jython releases that predate the True/False builtins.
    while 1:
        start = _findFirst(text, ("<a", "<A"), pos)
        if start == -1:
            # no more anchors: everything left is plain text
            kept.append(text[pos:])
            break

        # text in front of the anchor is kept as is
        kept.append(text[pos:start])

        # skip the attributes of the opening tag
        openEnd = text.find(">", start)
        if openEnd == -1:
            # truncated opening tag -- nothing usable after it
            break
        bodyStart = openEnd + 1

        close = _findFirst(text, ("</a>", "</A>"), bodyStart)
        if close == -1:
            # unterminated anchor: keep the remaining text anyway
            kept.append(text[bodyStart:])
            break

        kept.append(text[bodyStart:close])
        pos = close + len("</a>")

    return "".join(kept)


# Dutch catalogue terms and their English replacements.  The pairs are
# applied top to bottom, so the order is part of the behaviour.
_NL_TRANSLATIONS = (
    ("Geschiedenis", "History"),
    ("Pocket", "Mass Market Paperback"),
    ("Populaire pockets", "Popular Reading"),
    ("romans en novellen", "Literature"),
    ("oorspr. - Nederlands", "Dutch"),
    ("vertaald", "translated (Dutch)"),
    ("geb.", "bound"),
    ("pbk.", "paperback"),
    ("woordenboeken", "Dictionaries"),
)


def translate_nl(str):
    """Translate known Dutch catalogue terms in a string to English.

    Every occurrence of each term in _NL_TRANSLATIONS is replaced
    (case-sensitively, as a plain substring) and the result is returned
    with leading and trailing whitespace removed.  Text that matches no
    term is passed through unchanged.

    str -- the text to translate (the name is kept for compatibility
           with existing callers, although it shadows the builtin).
    """
    text = str
    for dutch, english in _NL_TRANSLATIONS:
        text = text.replace(dutch, english)
    return text.strip()

def extract():
    """Parse a Koninklijke Bibliotheek result page into the field globals.

    Takes no arguments and returns nothing.  The page text is read from
    the global ``source`` and the results are written to the globals
    declared below: title, author, author2, author3, date, place,
    publisher, series, pages, dimensions, isbn, category and comments,
    plus a set of fixed defaults.

    ``source`` is consumed as parsing proceeds: after each field is found
    it is rebound to the text that follows, so the fields are looked for
    in the order Title, Year, Publisher, Series, Extent, ISBN.  Because
    ``source`` is declared global, the rebinding is visible outside this
    function.  The category lookup works on a copy and does not consume
    ``source``.

    When the page lists several hits and is not already showing hit 1,
    the first hit is fetched over HTTP and parsed instead.

    The function returns early, leaving later fields untouched, when the
    page is not a results page, reports no hits, has no base URL for the
    refetch, or has no title value.

    NOTE(review): ``source`` and the starting values of the field globals
    are presumably supplied by the hosting application before this script
    runs -- nothing in this file sets them; confirm against the host.

    NOTE(review): searchForPlus(text, tag) is used throughout as "the text
    following the first occurrence of tag"; that matches how its result is
    sliced here, but its definition is not in this file.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = ""
    buyerwaiting        = ""
    weight              = ""
    cart                = ""


    # No place extraction: default to Netherlands
    # probably true - this is the reference library
    # place = "Netherlands"

    # save base if it exists
    # (the value of the BASE href attribute, up to its closing quote;
    # it is only needed below to build the URL of the first hit)
    i = string.find (source, "BASE href=\"")
    if i != -1:
        base = searchForPlus(source,  "BASE href=\"");
        i = string.find (base, "\"")
        base = stripText(base[0:i])
    else:
        base = "" 

    # is this the right page?
    start_tag = "class=\"h2\">results<"
    j = string.find (source, start_tag)
    if j == -1: 
        ## print "Can't parse page"
        return
    else:
        # continue with the text after the "results" heading
        source = searchForPlus(source, start_tag)

    # First case - no match
    i = string.find (source, "There are no hits.")
    if i != -1:
        # not found
        print "No hits"
        return

    # either we have the book page, or a list of editions sharing the
    # same ISBN

    # test for the multiple case.
    # NOTE: the test is i > 0, not i != -1, so a match at offset 0 is
    # treated the same as no match.
    i = string.find (source, "There are")
    if i > 0:
        # the hit count is the text between "There are" and "hits";
        # it is only printed, not used for any decision
        hits = searchForPlus(source, "There are")
        i = string.find (hits, "hits")
        hits = stripText(hits[0:i]);

        print "Hits:", hits
        i = string.find (source, "This is hit 1")
        if i == -1:
            # just use the first and hope that it is correct

            if base == "":
                print "no base page"
                return

            # fetch the page for the first hit; from here on "source" is
            # the newly fetched page (it is not re-checked for start_tag)
            url = base + "LNG=EN/SHW?FRST=1"
            http = HTTPConnection()
            ##http.resetReferer();
            http.blockForLoad();
            print "GET URL", url
            source = http.getContents(url)

            # tell the user that an edition was picked automatically
            comments = "Extracted one edition out of choice from KB NL- check for correct edition"

#    i = string.find (source, "This is hit 1")
#    if i == -1:
#        print "PROBLEM with page"
#        return

    #
    # this should be the product page
    #

    # Find Title
    # NOTE: i is not tested here; it is overwritten a few lines below
    tag = "<strong>Title:"
    i = string.find(source, tag)
    source = searchForPlus(source, tag)

    # if we have title, that source is better, otherwise get it from here
    # this one may contain extra (unwanted) stuff
    tag = "class=\"presvalue\">"
    i = string.find(source, tag)
    if (i != -1):
        source = searchForPlus(source, tag)
        # the title ends at the first ":" or " / ", whichever comes first.
        # If neither is present i ends up as -1 and the slice below takes
        # everything except the last character.
        i = string.find(source, ":");
        j = string.find(source, " / ");
        if i == -1 or ( j != -1 and i > j):
            i = j
        ## print "I J", i, j, "text", source[0:40]
        title = stripText(source[0:i])
        title = removeAtags(title)
        # the author statement follows the " / " separator
        source = searchForPlus(source, " / ")

        # candidate terminators for the author text
        j = string.find(source, ";")
        i = string.find(source, "</td")
        k = string.find(source, "[")
        l = string.find(source, "<BR")
    
        ##print "I J K", i, j, k
        # Which comes first?
        # Terminators that were not found are pushed out to 9999 so they
        # lose the comparison; i ("</td") itself is not normalised.
        if l == -1:
             l = 9999;
        if k == -1:
             k = 9999;
        if j == -1:
             j = 9999;
        if j < i:
             i = j
        if k < i:
             i = k
        if l < i:
             i = l
        author = stripText(source[0:i])
        author = stripHTML(author)
        author = string.replace(author, "<BR>", "")
        # this could be multiple authors, comma-separated
        #
        # Split on commas, passing each name through convertAuthor.
        authorList = []
        while author != "":
            i = string.find (author, ",")
            if i != -1:
                aauthor = stripText(author[0:i])
                aauthor = convertAuthor (aauthor)
                authorList.append (aauthor)
                author = stripText (author[i+1:])
            else:
                # last (or only) name
                aauthor = author
                aauthor = convertAuthor (aauthor)
                authorList.append (aauthor)
                author = ""
                break;

        # only the first three names are stored; any others are dropped
        if len(authorList) > 0:
            author = authorList[0]

        if len(authorList) > 1:
            author2 = authorList[1]

        if len(authorList) > 2:
            author3 = authorList[2]

    else:
        # no title
        print "No TITLE"
        return


    # Each remaining field is introduced by a "<strong>Label:" marker and
    # its value follows the separator below.  The separator is accepted
    # only if it starts within 50 characters after the marker.
    separator = "class=\"presvalue\">"
    # Find Year
    marker = "<strong>Year:"
    i = string.find(source, marker)
    if (i != -1):
        source = searchForPlus(source, marker)
        i = string.find(source, separator)
        if (i != -1 and i < 50):
            source = searchForPlus(source, separator)
            i = string.find(source, "</td")
            date = stripText(source[0:i])
            # sometimes, it's a compilation
            # (keep only the text after "cop.")
            i = string.find(date, "cop.")
            if i != -1:
                date = stripText(date[i+4:])
            # sometimes the year is surrounded by []
            i = string.find(date, "[")
            j = string.find(date, "]")
            if i != -1 and j != -1:
                date = stripText(date[i+1:j])

    # Find Publisher + Place
    # sometimes there are multiple publishers (1 NL, 1 BE)
    # NOTE(review): place and publisher are compared with "" before this
    # function ever assigns them, so they must already exist as globals
    # (presumably initialised to "" by the host) -- confirm.
    marker = "<strong>Publisher:"
    i = string.find(source, marker)
    if (i != -1):
        source = searchForPlus(source, marker)
        i = string.find(source, separator)
        if (i != -1 and i < 50):
            source = searchForPlus(source, separator)

            # One pass per "place : publisher" entry; entries are
            # separated by <BR> and the last one ends at </td.
            found_end = 0
            while found_end != 1:
                # the place is the text before the next ":"
                j = string.find(source, ":");
                if j != -1:
                    aplace = stripText(source[0:j])
                    aplace = removeAtags(aplace)
                    if place == "":
                        place = aplace
                    else:
                        place = place + ", " + aplace

                    source = searchForPlus(source, ":")
                else:
                    # no ":" left; note that the <BR> test below may
                    # still reset found_end to 0
                    found_end = 1

                i = string.find(source, "</td")
                j = string.find(source, "<BR>")
                ##print "I J", i, j
                if (j != -1 and j < i):
                    # multiple publisher
                    i = j
                    found_end = 0
                else:
                    found_end = 1

                apublisher = stripText(source[0:i])
                apublisher = removeAtags(apublisher)
                if publisher == "":
                    publisher = apublisher
                else:
                    publisher = publisher + ", " + apublisher
                # step over the terminator; "<BR>" and "</td" are both
                # four characters long
                source = stripText(source[i+4:])


    # Find Series
    # (the value is the text between the next '">' and the following "<")
    marker = "<strong>Series:"
    i = string.find(source, marker)
    if (i != -1):
        source = searchForPlus(source, marker)
        source = searchForPlus(source, "\">")
        i = string.find(source, "<")
        series = stripText(source[0:i])
        series = removeAtags(series)


    # Find Pages + Size
    marker = "<strong>Extent:"
    i = string.find(source, marker)
    if (i != -1):
        source = searchForPlus(source, marker)
        i = string.find(source, separator)
        if (i != -1 and i < 50):
            source = searchForPlus(source, separator)
            # the page count is whatever precedes "p."
            j = string.find(source, "p.");
            if j != -1:
                pages = stripText(source[0:j])
                source = searchForPlus(source, "p.")
                # skip a "; " that follows within 20 characters
                j = string.find(source, "; ");
                if j != -1 and j < 20:
                    source = searchForPlus(source, "; ")

            # whatever remains up to the end of the cell is the size
            i = string.find(source, "</td")
            dimensions = stripText(source[0:i])
            dimensions = removeAtags(dimensions)


    # Find ISBN and format
    # (format extraction is currently disabled -- see the commented-out
    # code at the end of this section)
    marker = "<strong>ISBN:"
    i = string.find(source, marker)
    if (i != -1):
        source = searchForPlus(source, marker)
        i = string.find(source, separator)
        if (i != -1 and i < 50):
            source = searchForPlus(source, separator)
            # step inside a <strong> that starts within 50 characters
            i = string.find(source, "<strong>")
            ##print "ISBN 1", i, j
            ##print "SRC", source[0:40]
            if i != -1 and i < 50:
                source = searchForPlus(source, "<strong>")
            # the ISBN is the text up to the closing </strong>
            j = string.find(source, "</strong>");
            if j != -1:
                ##print "ISBN 2", i, j
                ##print "SRC", source[0:40]
                isbn = stripText(source[0:j])
                ## isbn = string.replace(isbn, "-", "")
                #source = searchForPlus(source, "</strong>")
                ## look for , or < (or ":" = no format)
                #i = string.find (source, "<")
                #j = string.find (source, ",")
                #k = string.find (source, ":")
                #if i != -1 and i < j:
                #    if i < 50 and (k != -1 and i < k):
                #        format = stripText(source[0:i])
                #else:
                #    if j < 50 and (k != -1 and j < k):
                #        format = stripText(source[0:j])
                #format = translate_nl (format)

    # Find Category
    # more complicated. There are a number of possibilities:
    #   Subject heading depository library (Netherlands)
    #   Subject heading GOO
    #   Basic classification
    #   Code KB:
    # some spaces are replaced with &nbsp;
    category1 = "Basic&nbsp;classification:"
    category2 = "Subject&#xA0;heading&#xA0;GOO:"
    category3 = "Subject&#xA0;heading depository&#xA0;library (Netherlands):"

    # our preference list, in order
    category_markers = [category3, category1, category2]

    for basic_marker in category_markers:
        marker = "strong>" + basic_marker
        i = string.find(source, marker)
        # search in a copy so that "source" itself is not consumed and
        # every marker is looked up in the same text
        cat_source = source
        ## print "MARKER",  marker, "I", i
        if (i != -1):
            cat_source = searchForPlus(cat_source, marker)
            i = string.find(cat_source, separator)

            if (i != -1 and i < 50):
                cat_source = searchForPlus(cat_source, separator)
                i = string.find(cat_source, "</td")
                nl_category = stripText(cat_source[0:i])
                nl_category = removeAtags(nl_category)
                nl_category = stripHTML(nl_category)
                # translate the known Dutch terms to English
                category = translate_nl(nl_category)
                category = string.replace (category, "<BR>", "")
                # we have one, that's enough
                break

    return

# Run the scraper.  Whether or not extract() succeeds, give the optional
# user-exit script a chance to run afterwards.
try:
    extract()
finally:
    # The user exit is optional, so only run it when the file is present.
    # The namespace is passed as an argument: the former
    # 'execfile(...) in globals()' spelling was just a discarded
    # membership test on execfile's None result, not a namespace clause.
    # NOTE(review): presumably the hook post-processes the extracted
    # fields, which is why it shares this module's globals -- confirm.
    if os.path.exists("scrapers/userexit.py"):
        execfile("scrapers/userexit.py", globals())
