# proxis.be.scraper
# this one is funny. proxis operates in English, French and Dutch
# we will try to make it work in any language.....
#

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import convertAuthor
from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    scrapers.scrapers import stripHTML
from    scrapers.scrapers import stripNewLines
from    scrapers.scrapers import calcISBN13CheckDigit

def removeAtags(str):
    """Remove anchor tags from an HTML fragment, keeping everything else.

    Given ``PRE<a href="xxx">yyy</a>POST`` this returns ``PREyyyPOST``.
    Opening and closing tags are matched in either letter case, whichever
    occurs first, so fragments mixing ``<a>`` and ``<A>`` are handled.

    Malformed input is treated as best effort:
      * an opening tag with no ``>``, or with no closing ``</a>``, ends the
        scan; only the text collected before that tag is returned.
      * any tag whose name merely starts with "a" is treated as an anchor,
        because only the two characters ``<a`` are matched.

    str    -- the fragment to clean.  The parameter keeps its historical
              name (which shadows the builtin) so callers are unaffected.
    return -- the cleaned text; always a string, "" when ``str`` is None.
    """
    if str is None:
        return ""

    kept = []
    pos = 0
    while 1:
        opening = _findEarliest(str, ("<a", "<A"), pos)
        if opening == -1:
            # no further anchors: the remainder is plain text
            kept.append(str[pos:])
            break
        kept.append(str[pos:opening])

        # skip the attributes of the opening tag
        tagEnd = str.find(">", opening)
        if tagEnd == -1:
            break

        closing = _findEarliest(str, ("</a>", "</A>"), tagEnd + 1)
        if closing == -1:
            break
        # keep the link text, drop the closing tag
        kept.append(str[tagEnd + 1:closing])
        pos = closing + len("</a>")

    return "".join(kept)


def _findEarliest(text, candidates, start):
    """Return the lowest index >= start at which any candidate occurs, or -1."""
    earliest = -1
    for candidate in candidates:
        at = text.find(candidate, start)
        if at != -1 and (earliest == -1 or at < earliest):
            earliest = at
    return earliest


def translate_nl(str):
    """Translate the Dutch terms proxis uses for formats/categories to English.

    Every occurrence of each known Dutch term is replaced, in the order
    listed below, and the result is returned with surrounding whitespace
    removed.  Text that matches no term is passed through unchanged.

    str    -- the text to translate (name kept for backward compatibility).
    return -- the translated, stripped text.
    """
    # Order matters: a longer term must be replaced before a shorter term
    # it contains (e.g. "Geschiedeniswetenschappen" before "Geschiedenis").
    replacements = (
        ("Literatuur", "Literature"),
        ("Literaire romans", "Literature"),
        ("Vertaalde", "Translated"),
        ("Romans", "Novels"),
        ("Verhalen", "Stories"),
        ("Recht", "Law"),
        ("Computers & informatica", "IT"),
        ("Geschiedeniswetenschappen", "History of Science"),
        ("Geschiedenis", "History"),
        ("Pocket", "Mass Market Paperback"),
        ("Populaire pockets", "Popular Reading"),
    )

    translated = str
    for dutch, english in replacements:
        translated = translated.replace(dutch, english)
    return translated.strip()

def extract():
    """Scrape one proxis.be book page into the module-level record fields.

    Reads the page text from the global ``source`` (and the global
    ``fullDateFormat`` switch) and stores what it finds in the globals
    declared below (title, author, isbn, publisher, ...).  These globals are
    presumably supplied and consumed by the hosting application -- confirm.

    Flow:
      1. Locate a title marker.  If there is none, retry the search with the
         ISBN-13 form of the searched keyword.
      2. If the page is a result list (contains "1. "), download the first
         result's detail page.
      3. Walk the detail page top to bottom, consuming ``source`` as each
         field is read.  The order of the field lookups therefore matters.

    Side effects: may perform HTTP requests, writes every downloaded page to
    "trace2.html", and prints a message when the detail page has no title.
    Returns None; on an unusable page it returns early and leaves the
    remaining fields untouched.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source

    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""

    # No place extraction default to US
    # place = "Netherlands"

    separator = ": "
    upToTag = ("<",)

    firstmarker = _findTitleMarker(source)
    if firstmarker is None:
        # try search again with ISBN-13
        source = searchForPlus(source, "class=\"search-summary-keyword\"")
        if source is not None:
            source = searchForPlus(source, ">")
        if source is None:
            # not a page layout we know: nothing to scrape
            return
        i = source.find("<")
        isbn13 = stripText(source[0:i])

        if len(isbn13) == 10:
            # keyword was an ISBN-10: rebuild it with the 978 prefix
            isbn13 = "978" + isbn13[0:9]
            isbn13 = calcISBN13CheckDigit(isbn13)

            url = "http://oas2000.proxis.be/gate/jabba.searchII.do_search?p_keyword=" + isbn13
            source = _fetchPage(url)

        firstmarker = _findTitleMarker(source)
        if firstmarker is None:
            # can't even find the title. bail out
            return

    # Find title
    title = searchForPlus(source, firstmarker)
    i = title.find("<")
    title = stripText(title[0:i])

    # Now, just get the "more info link" The rest of this page is
    # unreliable

    # is this the search page or the detail page?
    if source.find("1. ") != -1:
        source = searchForPlus(source, "1. ")
        # get the details page - it's easier that way.
        source = searchForPlus(source, "<a href=")
        if source is None:
            # result list without a link to follow
            return
        # url delimitation is double quote, but just in case we check:
        # the first character is the quote, the url runs to its twin
        i = source[1:].find(source[0:1])
        source = source[1:]
        url = stripText(source[0:i])

        source = _fetchPage(url)
    else:
        if source.find("<TD class=SUBPRICE>ISBN</TD>") == -1:
            # we cant find anything, bail out
            return

    #
    # this should be the product page
    #

    # Find Title
    source = searchForPlus(source, "class=\"verd_13_b\">")
    if source is None:
        print("can't find TITLE")
        return
    # if we have title, that source is better, otherwise get it from here
    # this one may contain extra (unwanted) stuff
    # NOTE(review): the test below does the opposite of the comment above
    # (an existing title is overwritten from this page).  Behaviour is kept
    # as is -- confirm which of the two is intended.
    if title != "":
        i = source.find("<")
        title = stripText(source[0:i])
        source = stripText(source[i:])

    # Find "star" rating
    # this should be next, but sometimes is not there. In this case, we use
    # the first rating
    tag = "/imgs/stars-"
    if source.find(tag) != -1:
        # look ahead only: source itself is not advanced here
        starsource = searchForPlus(source, tag)
        i = starsource.find(".gif")
        r = stripText(starsource[0:i]).replace("-", ".")
        stars = _STAR_RATINGS.get(r)
        # an unrecognised image leaves the rating untouched
        if stars is not None:
            rating = stars + " Stars"

    # Find Author
    tag = "<span class=\"AUTHOR\">"
    if source.find(tag) != -1:
        source = searchForPlus(source, tag)
        i = source.find("<")
        # NOTE(review): i-1 drops the character just before the tag;
        # presumably a trailing separator -- confirm.
        author = stripText(source[:i-1])
        author = stripHTML(author)

    # Find Cover Image
    tag = "class=image width="
    if source.find(tag) != -1:
        source = searchForPlus(source, tag)
        tag = "<img src=\""
        i = source.find(tag)
        if i != -1 and i < 100:
            source = searchForPlus(source, tag)
            i = source.find("\"")
            image = stripText(source[0:i])
        else:
            # the next image is too far away to be the cover
            image = ""

    # Find Format
    state, field, source = _labelledValue(source, "Type", separator, upToTag)
    if state == _VALUE_FOUND:
        format = translate_nl(field)

    # Find Publisher
    ok, field, source = _firstLabelledValue(
        source, ("Publisher", ">Uitgever<"), separator, upToTag)
    if ok:
        publisher = field

    # Find Publication Date
    date = ""
    ok, field, source = _firstLabelledValue(
        source, ("Publication date", ">Verschijningsdatum<"), separator, upToTag)
    # 01/01/0001 is treated as "no date"
    if ok and field != "01/01/0001":
        date = field

    if date != "" and fullDateFormat == "false":
        # keep only what follows the last "/"
        i = date.rfind("/")
        if i != -1:
            date = stripText(date[i+1:])

    # Find Pages
    state, field, source = _labelledValue(source, "Pages", separator, upToTag)
    if state == _VALUE_FOUND:
        pages = field

    # Find Dimensions (2 possibilities - #1: 3 fields, length, width, height)
    # or #2: 1 field (Format)
    state, field, source = _labelledValue(source, "Format", separator, upToTag)
    if state == _VALUE_FOUND:
        dimensions = field
    elif state == _LABEL_ABSENT:
        # each measure ends at its "cm." unit, or at the next tag
        upToUnit = ("cm.", "<")
        length = ""
        width = ""
        height = ""
        ok, field, source = _firstLabelledValue(
            source, ("Length", ">Lengte<"), separator, upToUnit)
        if ok:
            length = field
        ok, field, source = _firstLabelledValue(
            source, ("Width", ">Breedte<"), separator, upToUnit)
        if ok:
            width = field
        ok, field, source = _firstLabelledValue(
            source, ("Height", ">Hoogte<"), separator, upToUnit)
        if ok:
            height = field
        if length != "" and width != "":
            dimensions = length + "x" + width + "x" + height + "cm"

    # Find ISBN
    state, field, source = _labelledValue(source, "ISBN", separator, upToTag)
    if state == _VALUE_FOUND:
        isbn = field

    # Find Category
    ok, field, source = _firstLabelledValue(
        source, ("Category", ">Categorie<"), separator, ("</td",))
    if ok:
        # the cell holds links: keep their text only, then translate
        category = translate_nl(stripHTML(removeAtags(field)))

    # Find PRICE
    i = source.find("EUR<")
    if i != -1:
        # the amount is taken from the 7 characters before "EUR<";
        # max() keeps the slice from wrapping when fewer are available
        value = " " + stripText(source[max(0, i - 7):i])
        value = value.replace(".", ",")

    # Find General Information
    if source.find("General information") != -1:
        source = searchForPlus(source, "General information")
        # Available?
        i = source.find("Sold out.")
        if i != -1 and i < 650:
            available = "N"

    # Find Comments - usually in the language of the book - FR/EN/NL
    state, field, source = _labelledValue(
        source, "<I>Notes</I>", "CLASS=SUBINFO>", upToTag)
    if state == _VALUE_FOUND:
        comments = field
    elif state == _LABEL_ABSENT:
        state, field, source = _labelledValue(
            source, "<I>Beschrijving</I>", "CLASS=SUBINFO>", ("</span>",))
        if state == _VALUE_FOUND:
            comments = field
        # this variant runs up to </span>, so inline markup may remain
        comments = comments.replace("<br>", "\n")
        for markup in ("<b>", "</b>", "<i>", "</i>"):
            comments = comments.replace(markup, "")

    # now, recheck authors. For multi-author books....
    marker = "Contributors :"
    if source.find(marker) != -1:
        source = searchForPlus(source, marker)
        contributors = source
        authorList = []

        # add original author if we found it earlier
        if author != "":
            authorList.append(author)

        while contributors is not None:
            if contributors.find("Author") == -1:
                break
            contributors = searchForPlus(contributors, "Author")
            i = contributors.find(separator)
            if i == -1 or i >= _MAX_LABEL_GAP:
                break
            contributors = searchForPlus(contributors, separator)
            i = contributors.find("</span>")
            authors = stripText(removeAtags(contributors[:i]))
            # Keep every named contributor except the one already listed.
            # (Contributors used to be dropped altogether when no primary
            # author had been found.)
            if authors != "" and authors != author:
                authorList.append(authors)
            contributors = searchForPlus(contributors, "</span>")

        if len(authorList) > 0:
            author = authorList[0]
        if len(authorList) > 1:
            author2 = authorList[1]
        if len(authorList) > 2:
            author3 = authorList[2]
        if len(authorList) > 3:
            author4 = authorList[3]
        if len(authorList) > 4:
            author5 = authorList[4]
        if len(authorList) > 5:
            author6 = authorList[5]

    if title != "" and author == "":
        author = "No Author"


# Outcomes of _labelledValue
_LABEL_ABSENT = 0       # the label does not occur in the text
_VALUE_MISSING = 1      # label found, but no separator close enough after it
_VALUE_FOUND = 2        # label and separator found, value extracted

# a separator must start within this many characters after its label
_MAX_LABEL_GAP = 50

# Site star image name -> rating text.  The site rates 1-7, this converts
# to the 1-5 range.
# NOTE(review): the table never yields 3.5 and maps 4.0/4.5/5.0 all to 3.0;
# kept exactly as it was -- confirm it is intended.
_STAR_RATINGS = {
    "0.5": "0.5", "1.0": "0.5", "1.5": "1.0", "2.0": "1.5",
    "2.5": "2.0", "3.0": "2.0", "3.5": "2.5", "4.0": "3.0",
    "4.5": "3.0", "5.0": "3.0", "5.5": "4.0", "6.0": "4.5",
    "6.5": "4.5", "7.0": "5.0",
}


def _findTitleMarker(text):
    """Return the title marker that occurs in text, or None if none does."""
    if text is None:
        return None
    for marker in ("class=\"verd_13_b\">", "<span class=titlecolor>"):
        if text.find(marker) != -1:
            return marker
    return None


def _fetchPage(url):
    """Download url, dump the page to the trace file and return its text."""
    http = HTTPConnection()
    http.resetReferer()
    http.blockForLoad()
    page = http.getContents(url)
    _dumpTrace(page)
    return page


def _dumpTrace(page):
    """Write page to "trace2.html"; a file that cannot be written is ignored."""
    # The dump is only a debugging aid, so an unwritable working directory
    # must not abort the scrape.
    try:
        trace = open("trace2.html", "w")
        try:
            trace.write(page)
        finally:
            trace.close()
    except IOError:
        pass


def _labelledValue(text, label, opener, enders, reach=_MAX_LABEL_GAP):
    """Read the value following label in text.

    The text is consumed past label, then past opener when opener starts
    within reach characters.  The value runs up to the first entry of enders
    that occurs at all (entries are tried in order).

    return -- (state, value, remaining text); state is one of _LABEL_ABSENT,
              _VALUE_MISSING, _VALUE_FOUND and value is None unless found.
    """
    if text.find(label) == -1:
        return _LABEL_ABSENT, None, text
    text = searchForPlus(text, label)

    at = text.find(opener)
    if at == -1 or at >= reach:
        return _VALUE_MISSING, None, text
    text = searchForPlus(text, opener)

    end = -1
    for ender in enders:
        end = text.find(ender)
        if end != -1:
            break
    return _VALUE_FOUND, stripText(text[0:end]), text


def _firstLabelledValue(text, labels, opener, enders):
    """Apply _labelledValue with the first of labels that occurs in text.

    return -- (value found?, value, remaining text)
    """
    state = _LABEL_ABSENT
    field = None
    for label in labels:
        state, field, text = _labelledValue(text, label, opener, enders)
        if state != _LABEL_ABSENT:
            break
    return state == _VALUE_FOUND, field, text
# Run the scraper as soon as this script is loaded.  Whether or not
# extract() succeeds, run the optional scrapers/userexit.py script when it
# exists (presumably a user customisation hook -- confirm).
try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        # The namespace is passed explicitly.  The former trailing
        # "in globals()" was only a discarded membership test and never
        # selected the namespace the script ran in.
        execfile("scrapers/userexit.py", globals())
