# bol.com (Netherlands) scraper
#
# copied from various sources
#

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import convertAuthor
from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    scrapers.scrapers import stripHTML
from    scrapers.scrapers import stripNewLines

# remove any <a> tags, keeping everything else
# assume <a href="xxx">yyy</a>
def removeAtags(str):

    cat = ""
    while (str != None):
        # save anything before
        tag = "<a"
        i = string.find(str, tag)
        if i == -1:
            tag = "<A"
            i = string.find(str, tag)
        if i == -1:
            cat = cat + str
            return cat
        else:
            if i > 0:
                cat = cat + str[:i]
                str = searchForPlus(str, tag)

        # move marker to "inside"
        tag = ">"
        i = string.find(str, tag)
        if i == -1:
            return cat
        else:
            str = searchForPlus(str, tag)

        tag = "</a>"
        i = string.find(str, tag)
        if i == -1:
            tag = "</A>"
            i = string.find(str, tag)
        if i == -1:
            return cat
        else:
            cat = cat + str[:i]
            str = searchForPlus(str, tag)

def translate_nl(str):
    str = string.replace (str, "Literatuur", "Literature")
    str = string.replace (str, "Vertaalde", "Translated")
    str = string.replace (str, "Romans", "Novels")
    str = string.replace (str, "Verhalen", "Stories")
    str = string.replace (str, "Spanning & Thrillers", "Thrillers")
    str = string.replace (str, "Geschiedenis", "History")
    str = string.replace (str, "Overig", "Other")
    str = string.replace (str, "Boeken /", "")
    return string.strip(str);

def extract():
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""


    # No place extraction default to US
    # place = "Netherlands"


    # is this the search page or the detail page?
    j = string.find (source, "ORENGE_parametric_search_result_nl")
    #print "j is", j
    #print "320\n", source[:320]
    if j != -1: 
        source = searchForPlus(source, "Tabelle Title Start")

        # get the details page - it's easier that way.
        source = searchForPlus(source, "<a href=")
        # url delimitation is double quote, but just in case we check
        i = string.find(source[1:], source[0:1])
        source = source[1:]
        #print source
        url = stripText(source[0:i]);

        http = HTTPConnection()
        http.resetReferer();
        http.blockForLoad();
        source = http.getContents(url)
        t2 = open("trace2.html", "w")
        t2.write(source)
        t2.close()

    #
    # this should be the product page

    # get the absolute uri part, in case required for images
    tempsource = string.find(source, "href=\"http://")
    tempsource = searchForPlus(source, "href=\"http://")
    j = string.find(tempsource, "/")
    uri = "http://" + stripText(tempsource[:j+1])


   
    # Find optional fields, pricing info etc.
    marketinfo = source


    # Find List price
    i = string.find(marketinfo, "Adviesprijs:")

    if (i != -1):
        listprice = searchForPlus(marketinfo, "bol.com prijs:")
        # now the euro symbol - stored as #8364;&nbsp;PRICE
        listprice = searchForPlus (listprice, "8364; ")
        i = string.find(listprice, "<")
        listprice = stripText(listprice[0:i])


    # Find Price
    i = string.find(marketinfo, "bol.com prijs:")

    if i != -1:
        value = searchForPlus(marketinfo, "bol.com prijs:")
        value = searchForPlus(value, "class=\"price\"")
        value = searchForPlus(value, ">")
        i = string.find(value, "<")
        value = stripText(value[0:i])
        value = stripHTML(value)
        value = string.replace(value, "&euro;", "")


    # Check for availability
    i = string.find(source, "Deze titel is momenteel niet leverbaar")

    if i != -1:
       available = "N";
 

    # Find Title
    tag = "END endeca_books/Left_panel_product"
    source = searchForPlus(source, tag)
    if source == None:
        print "can't find title"
        return

    source = searchForPlus(source, "<h1>")
    i = string.find(source, "<");
    title = stripText(source[0:i])
    source = stripText(source[i:])


    # Find Cover Image
    tag = "BEGIN endeca_books/book_slot"
    source = searchForPlus(source, tag)
    source = searchForPlus(source, "<tr valign=\"top\">")
    source = searchForPlus(source, "img src=\"")
    i = string.find(source, "\"")
    if i < 200:
        image = stripText(source[0:i])
        if (image[0] == "/"):
            image = uri + image;

        i = string.find(image, "BOL_DEF_book_large.gif")

        if i != -1:
            image = ""
    else:
        #print "i too big ", i
        image = "";
    #print "image is ", image


    # Find Author
    # author is in an href tag, comma separated authors
    authorList = []

    i = string.find(source, "&AUTHOR_ID=")

    if i != -1:
        tempsource = searchForPlus(source, "&AUTHOR_ID=")
        tempsource = searchForPlus(tempsource, ">")
        i = string.find(tempsource, "<")
        author = stripText(tempsource[0:i])
        i = string.rfind(author, " ")

        if i != -1:
            author = stripText(author[i:]) + ", " + stripText(author[0:i])

        authorList.append(author)
        haveAuthor = 1
    else:
        haveAuthor = 0

    if (i > 500):
        # Author too far away
        haveAuthor = 0

    if haveAuthor == 1:

        if tempsource != None:
            tempsource = searchForPlus(tempsource, ">")
            source = tempsource;

        while (tempsource != None):
            #print "WHILE LOOP", tempsource[:60]
            #check that we haven't hit translators
            author_index = string.find(tempsource, "BOL_ARTIST_ID")
            translator_index = string.find(tempsource, "Vertaler:")

            #print "auth ", author_index, " trans ", translator_index
            if author_index == -1:
                break;
            if author_index > 300:
                break;
            if translator_index == -1 or (author_index < translator_index):

                i = string.find(tempsource, "<")
                if i>0 & i < 500:
                   tempsource = searchForPlus(tempsource, "BOL_ARTIST_ID=")
                   tempsource = searchForPlus(tempsource, ">")
                   i = string.find(tempsource, "<")
                   authors = stripText(tempsource[0:i])
                   authorList.append(authors)

                   #prepare for next iteration
                   tempsource = searchForPlus(tempsource, "a>")
                else:
                   #bail out
                   tempsource = None
            else:
                tempsource = None

        if len(authorList) > 0:
            author = authorList[0]

        if len(authorList) > 1:
            author2 = authorList[1]

        if len(authorList) > 2:
            author3 = authorList[2]

        if len(authorList) > 3:
            author4 = authorList[3]

        if len(authorList) > 4:
            author5 = authorList[4]

        if len(authorList) > 5:
            author6 = authorList[5]

    if title != "" and len(authorList) == 0:
        author = "No Author"


    # Find attributes
    i = string.find(source, "END endeca_books/book_slot")

    if i != -1:
        attributes = stripText(source[0:i])

        i = string.find(attributes, " | ")

        if i != -1:
            attributes = stripText(attributes[i-100:])

        # Find Format
        tag= "class=\"small\">"
        i = string.find(attributes, tag)

        if i != -1:
            attributes = searchForPlus(attributes, tag)
            i = string.find(attributes, "|")
            format = stripText(attributes[0:i])


        # Find Pages - note sometimes absent
        i = string.find(attributes, "Pagina's")

        if i != -1:
            attributes = stripText(attributes[i-20:])
            attributes = searchForPlus(attributes, "|")
            i = string.find(attributes, "Pagina's")
            pages = stripText(attributes[0:i])

        # Find Publisher
        found_publisher = 0
        i = string.find(attributes, "|")

        if (i != -1 and i<200):
            attributes = searchForPlus(attributes, "|")
            i = string.find(attributes, "|")
            if i == -1 or i > 200:
                i = string.find(attributes, "<")
            if i != -1 or i < 200:
                publisher = stripText(attributes[0:i])
                publisher = stripHTML(attributes[0:i])
                found_publisher = 1
                i = string.find(publisher, ";")
                if i != -1:
                    publisher = stripText(publisher[0:i])

        # Find Published Date
        # only if valid publisher
        i = string.find(attributes, "|")

        if (i != -1 and i < 200 and found_publisher == 1):
            attributes = searchForPlus(attributes, "|")
            i = string.find(attributes, "<")
            date = stripText(attributes[0:i])

            if fullDateFormat == "false":
                i = string.rfind(date, " ")

                if i != -1:
                    date = stripText(date[i+1:])


        # Find ISBN
        i = string.find(attributes, "ISBN:")

        if i != -1:
            attributes = searchForPlus(attributes, "ISBN:")
            i = string.find(attributes, "\n")
            temp = stripHTML(attributes[0:i])

            if temp != "":
                isbn = temp

    # Find Rating
    i = string.find(attributes, "klantenbeoordeling")
    if i != -1:
        attributes = searchForPlus(attributes, "klantenbeoordeling")
        i = string.find(attributes, "icon_rating_")
        text_rating = stripText(attributes[i+12:i+14])
        rating = text_rating[0] + "." + text_rating[1] + " Stars"

    # Find Keywords
    i = string.find(source, "Trefwoorden:");
    print "KEYWORDS", i
    if (i != -1):
        keywords = searchForPlus(source, "Trefwoorden:")
        i = string.find(keywords, "<")
        keywords = stripText(keywords[0:i]);

    # Find Category
    i = string.find(source, "Dit product komt voor in:")

    if i != -1:
        attributes = searchForPlus(source, "Dit product komt voor in:")
        attributes = searchForPlus(attributes, "href =")
        attributes = searchForPlus(attributes, "\">")
        i = string.find(attributes, "<")

        nl_cat = removeAtags(attributes[0:i])
        nl_cat = string.replace (nl_cat, "\n", "")
        nl_cat = stripHTML(nl_cat)
        category = translate_nl(nl_cat)
        #


    # Find Dimensions
    i = string.find(attributes, "Afmetingen (mm):")

    if i != -1:
        attributes = searchForPlus(attributes, "Afmetingen (mm):")
        i = string.find(attributes, "|")
        dimensions = stripText(attributes[0:i])

    # Find Comments 
    i = string.find(source, "<h2>Beschrijving:</h2>")

    if i != -1:
        comments = searchForPlus(source, "<h2>Beschrijving:</h2>")
        i = string.find(comments, "</td>")
        comments = stripText(comments[0:i])
        comments = string.replace(comments, "<BR>", "\n")
        comments = string.replace(comments, "</a>", "")

        while (searchFor(comments, "<a ") != None):
                i = string.find(comments, "<a ")
                j = string.find(comments[i:], ">")
                comments = comments[0:i] + comments[i+j+1:]






try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        execfile("scrapers/userexit.py") in globals()
