# amazon.de scraper
#
# Copyright  1999-2007 Readerware Corporation.  All Rights Reserved.

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import convertAuthor
from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    scrapers.scrapers import replaceHTML
from    scrapers.scrapers import replaceURL


def extract():
    """Scrape book details from an amazon.de product page into module globals.

    Reads the page HTML from the global ``source`` and the global
    ``fullDateFormat`` setting, and assigns every field it can locate
    (title, authors, image, prices, format, pages, date, publisher, ISBN,
    sales rank, comments, ...) to the globals declared below.  A handful
    of fields get defaults up front; the remaining ones are only assigned
    when their marker text is found in the page.

    ``source`` is consumed as the extraction proceeds (it is repeatedly
    re-bound to the text following the last marker), so the order of the
    sections below matters.

    NOTE(review): searchFor, searchForPlus, stripText, replaceHTML,
    replaceURL and convertAuthor come from scrapers.scrapers and are not
    visible here.  The loops below rely on searchFor returning None when
    the tag is absent -- confirm against that module.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""


    # user1 would hold the language ("Deutsch"); intentionally not set.
    # place is not extracted either ("Germany" would be the natural value).

    error_tag = "<HTML><BODY><B>ERROR - URL: "

    if source.startswith(error_tag):
        # Special characters in URL, translate and try again
        source = searchForPlus(source, error_tag)
        i = string.find(source, " ")
        source = stripText(source[0:i])
        source = replaceURL(source)
        http = HTTPConnection()
        http.resetReferer()
        http.blockForLoad()
        source = http.getContents(source)

        # Keep a copy of the re-fetched page for troubleshooting; make sure
        # the handle is released even if the write fails.
        t2 = open("trace2.html", "w")
        try:
            t2.write(source)
        finally:
            t2.close()

    # If Amazon lists other editions, remove them.
    # They complicate price extraction.
    for marker in (">Other Editions", "td.otherEditions"):
        i = string.find(source, marker)

        if i != -1:
            j = string.find(source[i:], "</table")

            # Only cut when the end of the table was actually found;
            # with j == -1 the splice would duplicate characters instead.
            if j != -1:
                source = source[0:i+1] + source[i+j:]

    # Find optional fields, pricing info etc.
    marketinfo = source

    # new/used prices/count
    i = string.find(marketinfo, "more-buying-choices.gif")

    if i != -1:
        marketinfo = searchForPlus(marketinfo, "more-buying-choices.gif")

        # First offering link before the "sell yours" image: new copies.
        j = string.find(marketinfo, "sell-yours-here.gif")
        i = string.find(marketinfo, "/exec/obidos/offering-page/")

        if (i != -1) and (j > i):
            marketinfo = searchForPlus(marketinfo, "<a href=")
            marketinfo = searchForPlus(marketinfo, "/exec/obidos/offering-page/")
            marketinfo = searchForPlus(marketinfo, ">")
            i = string.find(marketinfo, " Neu</")

            if i != -1:
                newcount = stripText(marketinfo[0:i])
                marketinfo = searchForPlus(marketinfo, "<font")
                marketinfo = searchForPlus(marketinfo, ">")
                i = string.find(marketinfo, "<")
                newprice = replaceHTML(stripText(marketinfo[0:i]))
                newprice = string.replace(newprice, "EUR", "")

        # Next offering link (marketinfo has been advanced): used copies.
        j = string.find(marketinfo, "sell-yours-here.gif")
        i = string.find(marketinfo, "/exec/obidos/offering-page/")

        if (i != -1) and (j > i):
            marketinfo = searchForPlus(marketinfo, "<a href=")
            marketinfo = searchForPlus(marketinfo, "/exec/obidos/offering-page/")
            marketinfo = searchForPlus(marketinfo, ">")
            i = string.find(marketinfo, " Gebraucht</")

            if i != -1:
                usedcount = stripText(marketinfo[0:i])
                marketinfo = searchForPlus(marketinfo, "<font")
                marketinfo = searchForPlus(marketinfo, ">")
                i = string.find(marketinfo, "<")
                usedprice = replaceHTML(stripText(marketinfo[0:i]))
                usedprice = string.replace(usedprice, "EUR", "")

    # find availability
    i = string.find(marketinfo, "F&uuml;hren wir nicht oder nicht mehr")

    if i != -1:
        available = "N"


    # Main extraction


    # Find Image (first try)
    image = ""
    i = string.find(source, "registerImage(\"original_image\"")

    if i != -1:
        tempdata = searchForPlus(source, "registerImage(\"original_image\"")
        tempdata = searchForPlus(tempdata, "\"")
        i = string.find(tempdata, "\"")
        image = stripText(tempdata[0:i])

        i = string.find(image, "no-image")

        if i != -1:
            image = ""

        # Drop everything from the first comma through the last comma.
        i = string.find(image, ",")

        if i != -1:
            j = string.rfind(image[i:], ",")
            image = stripText(image[0:i] + image[i+j+1:])

        # Swap the "_AA" marker in the image name for "_SL".
        i = string.find(image, "_AA")

        if i != -1:
            image = stripText(image[0:i] + "_SL" + image[i+3:])


    # Find Title
    source = searchForPlus(source, "\"btAsinTitle\"")
    source = searchForPlus(source, ">")
    i = string.find(source, "<")
    title = replaceHTML(stripText(source[0:i]))


    # Find Author
    authorList = []
    tag = "&field-keywords="
    i = string.find(source, tag)

    # Only accept an author link within the first 5000 characters after
    # the title; otherwise fall back to the alternative link parameter.
    if i == -1 or i > 5000:
        tag = "&field-author="
        i = string.find(source, tag)

    if i != -1 and i <= 5000:
        source = searchFor(source, tag)

        # All author links are expected on the same line.
        i = string.find(source, "\n")
        authors = stripText(source[0:i])

        while (searchFor(authors, tag) != None):
            authors = searchForPlus(authors, tag)
            authors = searchForPlus(authors, "\">")
            i = string.find(authors, "<")
            tempValue = stripText(authors[0:i])
            tempValue = replaceHTML(tempValue)
            tempValue = convertAuthor(tempValue)
            authorList.append(tempValue)


        if len(authorList) > 0:
            author = authorList[0]

        if len(authorList) > 1:
            author2 = authorList[1]

        if len(authorList) > 2:
            author3 = authorList[2]

        if len(authorList) > 3:
            author4 = authorList[3]

        if len(authorList) > 4:
            author5 = authorList[4]

        if len(authorList) > 5:
            author6 = authorList[5]

    # A title without any author link gets a placeholder author.
    if title != "" and len(authorList) == 0:
        author = "(Herausg.)"


    # Find Image (second try, only if the first one found nothing)
    if image == "":
        i = string.find(source, "images.amazon.com/images/")

        if i == -1 or i > 2500:
            i = string.find(source, "images-amazon.com/images/")

        if i != -1 and i <= 2500:
            # Back up a little so the enclosing <img src=" is included.
            # Clamp at 0: a negative start would slice from the END of
            # the text and lose the image tag entirely.
            source = stripText(source[max(i-50, 0):])
            temp = searchForPlus(source, "<img src=\"")
            i = string.find(temp, "\"")
            image = stripText(temp[0:i])

            i = string.find(image, ",")

            if i != -1:
                j = string.rfind(image, ",")
                k = string.find(image[j:], "_")
                image = stripText(image[0:i] + image[j+k:])

            i = string.find(image, "_PE")

            if i != -1:
                j = string.find(image[i+1:], "_")
                image = stripText(image[0:i] + image[i+j+1:])

            # Only image names containing "ZZZ" are kept.
            i = string.find(image, "ZZZ")

            if i == -1:
                image = ""


    # find listprice
    i = string.find(source, "Preisempfehlung&#42;:&#160;<")

    if i != -1:
        source = searchForPlus(source, "Preisempfehlung&#42;:&#160;<")
        source = searchForPlus(source, ">")
        i = string.find(source, "<")
        listprice = replaceHTML(stripText(source[0:i]))

    # Find Price
    i = string.find(source, "Preis:<")

    if i != -1:
        source = searchForPlus(source, "Preis:<")
        source = searchForPlus(source, "\">")
        i = string.find(source, "<")
        value = stripText(source[0:i])
        value = replaceHTML(string.replace(value, "EUR", ""))


    # Find Attributes
    attrtag = ">Produktinformation<"
    i = string.find(source, attrtag)

    if i != -1:
        attributes = searchFor(source, attrtag)
        i = string.find(attributes, "</ul>")
        attributes = stripText(attributes[0:i])

        # Find Format
        format = searchForPlus(attributes, "<b>")
        i = string.find(format, "<")
        format = stripText(format[0:i])
        i = string.find(format, ":")

        if i != -1:
            format = stripText(format[0:i])


        # Find Pages
        attributes = searchForPlus(attributes, "<b>")
        i = string.find(attributes, "Seiten")

        if i != -1:
            # Back up to just before the page count; clamp at 0 so a
            # marker near the start does not wrap to a negative index.
            attributes = stripText(attributes[max(i-10, 0):])
            attributes = stripText(searchForPlus(attributes, ">"))

            i = string.find(attributes, " ")

            if i != -1:
                pages = stripText(attributes[0:i])

                i = string.find(pages, " ")

                if i != -1:
                    pages = stripText(pages[0:i])


        # Find Publication Date
        date = searchForPlus(attributes, "(")
        i = string.find(date, ")")
        date = replaceHTML(stripText(date[0:i]))

        # [Haai]
        # Sometimes there is no date supplied, and the scraper hits the
        # bracket of Category(ies), giving date "ies".
        if date[0:3] == "ies":
            date = ""

        # With the short date format, keep only the last word of the date.
        if fullDateFormat == "false":
            i = string.rfind(date, " ")

            if i != -1:
                date = stripText(date[i+1:])

        # Find Publisher
        i = string.find(attributes, ">Verlag:<")

        if i != -1:
            publisher = searchForPlus(attributes, ">Verlag:<")
            publisher = searchForPlus(publisher, ">")
            i = string.find(publisher, "<")
            publisher = replaceHTML(publisher[0:i])
            publisher = string.replace(publisher, "&amp;", "&")

            # Cut trailing details after ";" or "(".
            i = string.find(publisher, ";")

            if i != -1:
                publisher = stripText(publisher[0:i])

            i = string.find(publisher, "(")

            if i != -1:
                publisher = stripText(publisher[0:i])

        # Find ISBN
        i = string.find(attributes, "ISBN-10:")

        if i != -1:
            isbn = searchForPlus(attributes, "ISBN-10:")
            isbn = searchForPlus(isbn, ">")
            i = string.find(isbn, "<")
            isbn = replaceHTML(isbn[0:i])
            i = string.find(isbn, " ")

            if i != -1:
                isbn = stripText(isbn[0:i])


        # sales ranking
        i = string.find(attributes, ">Amazon.de Verkaufsrang:<")

        if i != -1:
            salesrank = searchForPlus(attributes, ">Amazon.de Verkaufsrang:<")
            salesrank = searchForPlus(salesrank, ">")
            i = string.find(salesrank, "<")
            salesrank = stripText(salesrank[0:i])

            if salesrank.startswith("#"):
                salesrank = searchForPlus(salesrank, "#")

            i = string.find(salesrank, " ")

            if i != -1:
                salesrank = stripText(salesrank[0:i])


    # Find Comments
    tag = ">Produktbeschreibung"
    i = string.find(source, tag)

    if i == -1:
        tag = ">reviews<"
        i = string.find(source, tag)

    if i == -1:
        tag = ">Reviews<"
        i = string.find(source, tag)

    if i != -1:
        comments = ""
        source = searchForPlus(source, tag)
        source = searchForPlus(source, "<div")
        source = searchForPlus(source, ">")
        i = string.find(source, "</div>")
        tempcomments = source[0:i]

        # Trim at the first form or horizontal rule, if any.
        i = string.find(tempcomments, "</form>")

        if i != -1:
            tempcomments = tempcomments[0:i]

        i = string.find(tempcomments, "<hr noshade")

        if i != -1:
            tempcomments = tempcomments[0:i]

        # Skip past leading <font ...> / <span ...> wrappers.
        i = string.find(tempcomments, "<font")

        if i != -1:
            tempcomments = searchForPlus(tempcomments, "<font")

        i = string.find(tempcomments, "<span")

        if i != -1:
            tempcomments = searchForPlus(tempcomments, "<span")

        tempcomments = searchForPlus(tempcomments, ">")
        i = string.find(tempcomments, "<span")

        if i != -1:
            j = string.find(tempcomments[i:], ">")
            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        # Keep a trailing ellipsis but nothing after it.
        i = string.find(tempcomments, "... <")

        if i != -1:
            tempcomments = stripText(tempcomments[0:i+3])

        tempcomments = string.replace(tempcomments, "\n", "")

        # Collapse runs of spaces and spaces following paragraph/break tags.
        while (searchFor(tempcomments, "  ") != None):
            i = string.find(tempcomments, "  ")
            tempcomments = tempcomments[0:i+1] + tempcomments[i+2:]

        while (searchFor(tempcomments, "<p> ") != None):
            i = string.find(tempcomments, "<p> ")
            tempcomments = tempcomments[0:i+3] + tempcomments[i+4:]

        while (searchFor(tempcomments, "<P> ") != None):
            i = string.find(tempcomments, "<P> ")
            tempcomments = tempcomments[0:i+3] + tempcomments[i+4:]

        while (searchFor(tempcomments, "<br> ") != None):
            i = string.find(tempcomments, "<br> ")
            tempcomments = tempcomments[0:i+4] + tempcomments[i+5:]

        while (searchFor(tempcomments, "<BR> ") != None):
            i = string.find(tempcomments, "<BR> ")
            tempcomments = tempcomments[0:i+4] + tempcomments[i+5:]

        # Turn simple markup and entities into plain text.
        tempcomments = string.replace(tempcomments, "</a>", "")
        tempcomments = string.replace(tempcomments, "</A>", "")
        tempcomments = string.replace(tempcomments, "<p>", "\n\n")
        tempcomments = string.replace(tempcomments, "<P>", "\n\n")
        tempcomments = string.replace(tempcomments, "<BR>", "\n")
        tempcomments = string.replace(tempcomments, "<br>", "\n")
        tempcomments = string.replace(tempcomments, "<br />", "\n")
        tempcomments = string.replace(tempcomments, "<i>", "")
        tempcomments = string.replace(tempcomments, "</i>", "")
        tempcomments = string.replace(tempcomments, "<I>", "")
        tempcomments = string.replace(tempcomments, "</I>", "")
        tempcomments = string.replace(tempcomments, "<b>", "")
        tempcomments = string.replace(tempcomments, "</b>", "")
        tempcomments = string.replace(tempcomments, "<em>", "")
        tempcomments = string.replace(tempcomments, "</em>", "")
        tempcomments = string.replace(tempcomments, "</font>", "")
        tempcomments = string.replace(tempcomments, "</span>", "")
        tempcomments = string.replace(tempcomments, "&copy;", "")
        tempcomments = string.replace(tempcomments, "&#145;", "'")
        tempcomments = string.replace(tempcomments, "&#169;", "")
        tempcomments = string.replace(tempcomments, "&#8217;", "'")
        tempcomments = string.replace(tempcomments, "&quot;", "\"")
        tempcomments = string.replace(tempcomments, "&#8211;", "-")
        tempcomments = string.replace(tempcomments, "&#146;", "\'")
        tempcomments = string.replace(tempcomments, "<blockquote>", "")
        tempcomments = string.replace(tempcomments, "</blockquote>", "")

        while (searchFor(tempcomments, "\n ") != None):
            i = string.find(tempcomments, "\n ")
            tempcomments = tempcomments[0:i+1] + tempcomments[i+2:]

        # Strip remaining opening tags.  When a tag is never closed
        # (no ">" after it), drop just the matched opener text so each
        # loop is guaranteed to make progress and terminate.
        while (searchFor(tempcomments, "<a ") != None):
            i = string.find(tempcomments, "<a ")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = len("<a ") - 1

            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        while (searchFor(tempcomments, "<A ") != None):
            i = string.find(tempcomments, "<A ")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = len("<A ") - 1

            tempcomments = stripText(tempcomments[0:i] + tempcomments[i+j+1:])

        while (searchFor(tempcomments, "<img src") != None):
            i = string.find(tempcomments, "<img src")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = len("<img src") - 1

            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        while (searchFor(tempcomments, "<span") != None):
            i = string.find(tempcomments, "<span")
            j = string.find(tempcomments[i:], ">")

            if j == -1:
                j = len("<span") - 1

            tempcomments = tempcomments[0:i] + tempcomments[i+j+1:]

        tempcomments = stripText(tempcomments)

        if tempcomments.endswith("See all reviews"):
            i = string.rfind(tempcomments, "See all reviews")
            tempcomments = stripText(tempcomments[0:i])

        comments = tempcomments
    

# Run the scraper, then always give an optional user exit script the chance
# to post-process the extracted globals, even if extraction raised.
try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        # Pass the namespace as an argument: the former
        # `execfile(...) in globals()` merely ran the file and then
        # evaluated a meaningless membership test on the result.
        execfile("scrapers/userexit.py", globals())
