# amazon.fr scraper
#
# Copyright  1999-2007 Readerware Corporation.  All Rights Reserved.

import  os
import  string

from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText


def extract():
    """Scrape book details out of an amazon.fr product page.

    Input comes from two module-level globals that are not assigned in this
    file: ``source`` (the page HTML) and ``fullDateFormat`` (when it is the
    string "false" only the last word of the publication date is kept).

    Results are published through the module-level globals declared below
    (title, author, isbn, publisher, ...); the function returns nothing.

    The parse is sequential and destructive: ``source`` (itself a global) is
    repeatedly cut down to the text following the last match, so the sections
    must stay in the order in which they appear on the page.

    The helpers come from scrapers.scrapers.  As used here, searchFor and
    searchForPlus yield None when the tag is absent; searchFor yields the text
    starting at the tag.  searchForPlus presumably yields the text after the
    tag, and stripText presumably trims surrounding whitespace -- confirm
    against scrapers.scrapers.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""

    # No place extraction default to France
    # place = "France"


    # Main extraction


    # Find Image (first try): URL passed to the page's registerImage() call
    image = ""
    i = string.find(source, "registerImage(\"original_image\"")

    if i != -1:
        tempdata = searchForPlus(source, "registerImage(\"original_image\"")
        tempdata = searchForPlus(tempdata, "\"")
        i = string.find(tempdata, "\"")
        image = stripText(tempdata[0:i])

        # Placeholder artwork is worthless, drop it
        i = string.find(image, "no-image")

        if i != -1:
            image = ""

        # Cut out everything from the first comma through the last comma
        i = string.find(image, ",")

        if i != -1:
            j = string.rfind(image[i:], ",")
            image = stripText(image[0:i] + image[i+j+1:])

        # Swap the "_AA" marker in the URL for "_SL"
        i = string.find(image, "_AA")

        if i != -1:
            image = stripText(image[0:i] + "_SL" + image[i+3:])


    # Find Title
    source = searchForPlus(source, "\"btAsinTitle\"")
    source = searchForPlus(source, ">")
    i = string.find(source, "<")
    title = stripText(source[0:i])

    # Drop the trailing parenthesised part of the title
    i = string.rfind(title, "(")

    if i != -1:
        title = stripText(title[0:i])


    # Find Author
    i = string.find(source, "<a href=")

    haveAuthor = 1
    authorList = []

    if i == -1:
        haveAuthor = 0

    if i > 5000:
        # Author too far away, Amazon sometimes does not list the book
        # author.
        haveAuthor = 0

    if haveAuthor == 1:
        source = searchFor(source, "\">")

        # All author links are expected on a single line
        i = string.find(source, "\n")
        authors = stripText(source[0:i])

        # Each "\">" closes one link's opening tag; consume them one by one
        while searchFor(authors, "\">") is not None:
            authors = searchForPlus(authors, "\">")
            i = string.find(authors, "\">")

            if i != -1:
                tempAuthor = stripText(authors[0:i])
            else:
                tempAuthor = stripText(authors)

            i = string.find(tempAuthor, "<")
            author = stripText(tempAuthor[0:i])

            # Turn "First Last" into "Last, First"
            i = string.rfind(author, " ")

            if i != -1:
                author = stripText(author[i:]) + ", " + stripText(author[0:i])

            # Keep a parenthesised role that follows the link, e.g. "(...)"
            i = string.find(tempAuthor, "</a>")

            if i != -1:
                tempValue = searchFor(tempAuthor, "</a>")

                if tempValue[0:6] == "</a> (":
                    tempValue = searchForPlus(tempAuthor, "</a> ")
                    i = string.find(tempValue, ")")
                    author = author + " " + stripText(tempValue[0:i+1])

            # ... except for the plain author role, which adds nothing
            i = string.find(author, "(Author)")

            if i != -1:
                author = stripText(author[0:i])

            i = string.find(author, "(Auteur)")

            if i != -1:
                author = stripText(author[0:i])

            authorList.append(author)


        if len(authorList) > 0:
            author = authorList[0]

        if len(authorList) > 1:
            author2 = authorList[1]

        if len(authorList) > 2:
            author3 = authorList[2]

        if len(authorList) > 3:
            author4 = authorList[3]

        if len(authorList) > 4:
            author5 = authorList[4]

        if len(authorList) > 5:
            author6 = authorList[5]

    if title != "" and len(authorList) == 0:
        author = "No Author"


    # Find Image (second try, only when registerImage() gave nothing)
    if image == "":
        tag = "/images.amazon.com"
        i = string.find(source, tag)

        if i == -1 or i > 3000:
            tag = "images-amazon.com/"
            i = string.find(source, tag)

        if i != -1 and i < 3000:
            # Back up a little so the enclosing <img src=" is included.
            # Clamp at 0: a negative start would slice from the END of the
            # page and throw away everything still to be parsed.
            source = stripText(source[max(i - 50, 0):])
            temp = searchForPlus(source, "<img src=\"")
            i = string.find(temp, "\"")
            image = stripText(temp[0:i])

            # Remove the text from the first comma up to the first "_" found
            # at or after the last comma
            i = string.find(image, ",")

            if i != -1:
                j = string.rfind(image, ",")
                k = string.find(image[j:], "_")
                image = stripText(image[0:i] + image[j+k:])

            # Remove the "_PE..." segment up to the next "_"
            i = string.find(image, "_PE")

            if i != -1:
                j = string.find(image[i+1:], "_")
                image = stripText(image[0:i] + image[i+j+1:])

            # Only URLs containing "ZZZ" are kept.
            # NOTE(review): presumably "ZZZ" marks the large cover variant
            # -- confirm.
            i = string.find(image, "ZZZ")

            if i == -1:
                image = ""


            # image fix-up, get rid of the standard "no image available"
            # as this adds nothing, and potentially stops real images being added
            j = string.find(image, "no-img-lg.gif")
            if j != -1:
                image = ""
            else:
                j = string.find(image, "no-img-sm.gif")
                if j != -1:
                    image = ""


    # Find Price
    i = string.find(source, "Prix &eacute;diteur")

    # NOTE(review): this literal was checked twice in a row with identical
    # text; the accented "e" looks like it was lost from the file at some
    # point (originally two encodings?).  One check is equivalent.
    if i == -1:
        i = string.find(source, ">Prix diteur:<")

    if i == -1:
        i = string.find(source, "Notre prix :")

    if i != -1:
        source = source[i:]
        i = string.find(source, "EUR ")
        source = source[i:]

        i = string.find(source, "<")
        value = stripText(source[0:i])
    else:
        # No price, maybe only title/author, check for isbn
        # and if that missing too, give up.
        i = string.find(source, "ISBN-10:")

        if i == -1:
            # Let's try the ASIN instead
            i = string.find(source, "ASIN:")

            if i == -1:
                # Out of print, that's all the info available.
                return


    # Find Comments
    tag = ">Descriptions du produit<"
    i = string.find(source, tag)

    if i == -1:
        tag = ">reviews<"
        i = string.find(source, tag)

    if i == -1:
        tag = ">Reviews<"
        i = string.find(source, tag)

    if i != -1:
        comments = ""
        # The text lives in the first <div> after the heading
        source = searchForPlus(source, tag)
        source = searchForPlus(source, "<div")
        source = searchForPlus(source, ">")
        i = string.find(source, "</div>")
        tempcomments = source[0:i]

        # Trim trailing page furniture
        i = string.find(tempcomments, "</form>")

        if i != -1:
            tempcomments = tempcomments[0:i]

        i = string.find(tempcomments, "<hr noshade")

        if i != -1:
            tempcomments = tempcomments[0:i]

        # Skip past the leading <font ...> / <span ...> wrapper tags
        i = string.find(tempcomments, "<font")

        if i != -1:
            tempcomments = searchForPlus(tempcomments, "<font")

        i = string.find(tempcomments, "<span")

        if i != -1:
            tempcomments = searchForPlus(tempcomments, "<span")

        tempcomments = searchForPlus(tempcomments, ">")
        i = string.find(tempcomments, "<span")

        if i != -1:
            j = string.find(tempcomments[i:], ">")
            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        tempcomments = string.replace(tempcomments, "\n", "")

        # Collapse runs of spaces to a single space
        while searchFor(tempcomments, "  ") is not None:
            i = string.find(tempcomments, "  ")
            tempcomments = tempcomments[0:i+1] + tempcomments[i+2:]

        # Remove the space following paragraph / line-break tags
        while searchFor(tempcomments, "<p> ") is not None:
            i = string.find(tempcomments, "<p> ")
            tempcomments = tempcomments[0:i+3] + tempcomments[i+4:]

        while searchFor(tempcomments, "<P> ") is not None:
            i = string.find(tempcomments, "<P> ")
            tempcomments = tempcomments[0:i+3] + tempcomments[i+4:]

        while searchFor(tempcomments, "<br> ") is not None:
            i = string.find(tempcomments, "<br> ")
            tempcomments = tempcomments[0:i+4] + tempcomments[i+5:]

        while searchFor(tempcomments, "<BR> ") is not None:
            i = string.find(tempcomments, "<BR> ")
            tempcomments = tempcomments[0:i+4] + tempcomments[i+5:]

        # Markup -> plain text
        tempcomments = string.replace(tempcomments, "</a>", "")
        tempcomments = string.replace(tempcomments, "</A>", "")
        tempcomments = string.replace(tempcomments, "<p>", "\n\n")
        tempcomments = string.replace(tempcomments, "<P>", "\n\n")
        tempcomments = string.replace(tempcomments, "<BR>", "\n")
        tempcomments = string.replace(tempcomments, "<br>", "\n")
        tempcomments = string.replace(tempcomments, "<br />", "\n")
        tempcomments = string.replace(tempcomments, "<i>", "")
        tempcomments = string.replace(tempcomments, "</i>", "")
        tempcomments = string.replace(tempcomments, "<I>", "")
        tempcomments = string.replace(tempcomments, "</I>", "")
        tempcomments = string.replace(tempcomments, "<b>", "")
        tempcomments = string.replace(tempcomments, "</b>", "")
        tempcomments = string.replace(tempcomments, "<em>", "")
        tempcomments = string.replace(tempcomments, "</em>", "")
        tempcomments = string.replace(tempcomments, "</font>", "")
        tempcomments = string.replace(tempcomments, "</span>", "")
        tempcomments = string.replace(tempcomments, "&copy;", "")
        tempcomments = string.replace(tempcomments, "&#145;", "'")
        tempcomments = string.replace(tempcomments, "&#169;", "")
        tempcomments = string.replace(tempcomments, "&#8217;", "'")
        tempcomments = string.replace(tempcomments, "&quot;", "\"")
        tempcomments = string.replace(tempcomments, "&#8211;", "-")
        tempcomments = string.replace(tempcomments, "&#146;", "\'")
        # Hexadecimal character references -> their decimal equivalents
        tempcomments = string.replace(tempcomments, "&#x9C;", "&#156;")
        tempcomments = string.replace(tempcomments, "&#xB0;", "&#176;")
        tempcomments = string.replace(tempcomments, "&#xC9;", "&#201;")
        tempcomments = string.replace(tempcomments, "&#xE0;", "&#224;")
        tempcomments = string.replace(tempcomments, "&#xE2;", "&#226;")
        tempcomments = string.replace(tempcomments, "&#xEE;", "&#238;")
        tempcomments = string.replace(tempcomments, "&#xEF;", "&#239;")
        tempcomments = string.replace(tempcomments, "&#xE7;", "&#231;")
        tempcomments = string.replace(tempcomments, "&#xE8;", "&#232;")
        tempcomments = string.replace(tempcomments, "&#xE9;", "&#233;")
        tempcomments = string.replace(tempcomments, "&#xEA;", "&#234;")
        tempcomments = string.replace(tempcomments, "&#xF4;", "&#244;")
        tempcomments = string.replace(tempcomments, "&#xF9;", "&#249;")
        tempcomments = string.replace(tempcomments, "&#xFB;", "&#251;")
        tempcomments = string.replace(tempcomments, "<blockquote>", "")
        tempcomments = string.replace(tempcomments, "</blockquote>", "")

        # Remove a space that directly follows a newline
        while searchFor(tempcomments, "\n ") is not None:
            i = string.find(tempcomments, "\n ")
            tempcomments = tempcomments[0:i+1] + tempcomments[i+2:]

        # Strip the remaining opening tags.  In every loop below, when the
        # tag has no closing ">" only the opener text itself is removed;
        # without that the string would never change and the loop would
        # never end.
        while searchFor(tempcomments, "<a ") is not None:
            i = string.find(tempcomments, "<a ")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = 2

            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        while searchFor(tempcomments, "<A ") is not None:
            i = string.find(tempcomments, "<A ")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = 2

            tempcomments = stripText(tempcomments[0:i] + tempcomments[i+j+1:])

        while searchFor(tempcomments, "<img src") is not None:
            i = string.find(tempcomments, "<img src")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = 7

            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        while searchFor(tempcomments, "<span") is not None:
            i = string.find(tempcomments, "<span")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = 4

            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        tempcomments = stripText(tempcomments)
        if tempcomments.endswith("See all reviews"):
            i = string.rfind(tempcomments, "See all reviews")
            tempcomments = stripText(tempcomments[0:i])

        comments = tempcomments


    # Find product details.  The heading is matched without its leading
    # accented letter.  attributes stays "" when the section is missing so
    # the series / ISBN lookups further down simply find nothing.
    attributes = ""
    i = string.find(source, "tails sur le produit")

    if i != -1:
        attributes = searchForPlus(source, "tails sur le produit")
        # first item of the description data; should be the format
        attributes = searchForPlus(attributes, "<li><b>")
        i = string.find(attributes, "</ul>")
        attributes = attributes[0:i]
        i = string.find(attributes, ":</b>")
        formatline = attributes[0:i]
        format = "Unknown Binding"

        # Later matches deliberately override earlier ones
        if searchForPlus(formatline, "Poche") is not None:
            format = "Mass Market Paperback"

        if searchForPlus(formatline, "Broch") is not None:
            format = "Paperback"

        if searchForPlus(formatline, "Cartonn") is not None:
            format = "Hardback"

        if searchForPlus(formatline, "Album") is not None:
            format = "Hardback"

        if format != "Unknown Binding":
            # Find Pages: the page count follows the format label
            attributes = searchForPlus(attributes, "</b>")
            i = string.find(attributes, "pages")
            if i != -1:
                pages = stripText(attributes[0:i-1])

        # Find Publisher
        attributes = searchForPlus(attributes,"<b>Editeur")
        attributes = searchForPlus(attributes, ">")
        i = string.find(attributes, "<")
        publisher = stripText(attributes[0:i])

        # Join a value that was wrapped over two lines ...
        i = string.find(publisher, "\n")

        if i != -1:
            pub1 = stripText(publisher[0:i])
            pub2 = stripText(publisher[i+1:])
            publisher = pub1 + " " + pub2

        # ... and ignore anything beyond that
        i = string.find(publisher, "\n")

        if i != -1:
            publisher = stripText(publisher[0:i])

        # The publication date is the parenthesised text after the name
        i = string.find(publisher, "(")

        if i != -1:
            date = stripText(publisher[i+1:])
            publisher = stripText(publisher[0:i])
            i = string.find(date, ")")
            date = stripText(date[0:i])

            if fullDateFormat == "false":
                # Keep only the last word of the date
                i = string.rfind(date, " ")

                if i != -1:
                    date = stripText(date[i+1:])

        # Drop whatever follows a ";" or "&nbsp" in the publisher name
        i = string.find(publisher, ";")

        if i != -1:
            publisher = stripText(publisher[0:i])

        i = string.find(publisher, "&nbsp")

        if i != -1:
            publisher = stripText(publisher[0:i])


    # Find series if any
    i = string.find(attributes, ">Collection&")

    if i != -1:
        attributes = searchForPlus(attributes, ">Collection&")
        attributes = searchForPlus(attributes, ">")
        i = string.find(attributes, "<")
        series = stripText(attributes[0:i])


    # Find ISBN
    i = string.find(attributes, "ISBN-10:")

    if i != -1:
        attributes = searchForPlus(attributes, "ISBN-10:")
        attributes = searchForPlus(attributes, ">")
        i = string.find(attributes, "<")
        isbn = stripText(attributes[0:i])

    # Find Dimensions
    i = string.find(source, "Dimensions (en&nbsp;cm)&nbsp;:")

    if i != -1:
        source = searchForPlus(source, "Dimensions (en&nbsp;cm)&nbsp;:")
        source = searchForPlus(source, ">")
        i = string.find(source, "<")
        # sometimes this is left blank. Only include for non-null
        if i > 1:
            dimensions = stripText(source[0:i]) + " cm"

    # Find category
    i = string.find(source, "<b>Parcourir les")

    if i != -1:
        source = searchForPlus(source, "<b>Parcourir les")
        source = searchForPlus(source, "\">")
        i = string.find(source, "<")
        category = stripText(source[0:i])

# Run the scrape, then give an optional user exit script the chance to
# post-process the extracted globals.  The finally clause runs the user exit
# even when extract() raises; the exception still propagates afterwards.
try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        # Pass the namespace explicitly.  The previous form,
        # "execfile(...) in globals()", only evaluated a discarded
        # membership test on execfile's return value (a leftover of the
        # "exec code in globals()" statement syntax).
        execfile("scrapers/userexit.py", globals())
