# audible.com audiobook scraper
#
# Copyright  1999-2007 Readerware Corporation.  All Rights Reserved.

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    scrapers.scrapers import stripHTML
from    scrapers.scrapers import stripNewLines


def _fetchPage(url, traceName):
    """Fetch url over a fresh HTTPConnection and return the page contents.

    The contents are also written to the file traceName (relative to the
    current working directory). The trace file is closed even when the
    write fails.
    """
    http = HTTPConnection()
    http.resetReferer()
    http.blockForLoad()
    contents = http.getContents(url)

    trace = open(traceName, "w")
    try:
        trace.write(contents)
    finally:
        trace.close()

    return contents


def _stripOpenTags(text, tagStart, replacement):
    """Replace each tag that begins with tagStart, up to its ">", by replacement.

    Stops as soon as a tag has no closing ">" so that truncated or
    malformed markup cannot make the loop spin forever.
    """
    while searchFor(text, tagStart) is not None:
        i = string.find(text, tagStart)

        if i == -1:
            break

        j = string.find(text[i:], ">")

        if j == -1:
            # Unterminated tag: nothing sensible to cut, leave the rest as is.
            break

        text = text[0:i] + replacement + text[i+j+1:]

    return text


def extract():
    """Parse the audible.com page held in the global source.

    The extracted values are stored in the module globals declared below
    (title, author..author6, publisher, date, format, dimensions, rating,
    value, listprice, image, comments, ...).

    Side effects: when the page is a meta refresh stub, or links to a
    "See product details" page, the target page is downloaded and replaces
    source; each download is also written to trace2.html / trace3.html.

    Returns early, leaving only the defaults and the list price set, when
    the page has no id="description marker.

    NOTE(review): searchFor / searchForPlus come from scrapers.scrapers.
    This code compares their result against None, so they presumably return
    None when the token is missing - confirm against that module.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""

    # Refresh page if necessary: follow the URL= target of a meta refresh
    if string.find(source, "http-equiv=\"refresh\"") != -1:
        source = searchForPlus(source, "http-equiv=\"refresh\"")
        i = string.rfind(source, "URL=")
        source = stripText(source[i+4:])
        i = string.find(source, "\"")
        url = "http://www.audible.com" + stripText(source[0:i])
        source = _fetchPage(url, "trace2.html")


    # Get detail page if necessary
    i = string.find(source, ">See product details<")

    if i != -1:
        # Back up to 100 characters to reach the href of the enclosing link.
        # Clamp at 0: a negative start would slice from the END of the page.
        source = stripText(source[max(0, i-100):])
        source = searchForPlus(source, "href=\"")
        i = string.find(source, "\"")
        url = "http://www.audible.com" + stripText(source[0:i])
        source = _fetchPage(url, "trace3.html")



    # No place extraction default to US
    # place = "United States"


    # Find optional fields, pricing info etc.
    marketinfo = source


    # Find List price
    i = string.find(marketinfo, "Retail Price:")

    if i != -1:
        retail = searchForPlus(marketinfo, "Retail Price:")
        retail = searchFor(retail, "$")

        # No "$" after the label: keep the "" default instead of failing.
        if retail is not None:
            i = string.find(retail, "<")
            listprice = stripText(retail[0:i])


    # Main extraction
    i = string.find(source, "id=\"description")

    if i == -1:
        return


    # Find Image
    i = string.find(source, "class=\"descThumb\"")

    if i != -1:
        image = searchForPlus(source, "class=\"descThumb\"")
        image = searchForPlus(image, "src='")
        i = string.find(image, "'")
        image = stripText("http://www.audible.com" + image[0:i])


    # Find Title
    # From here on source is progressively narrowed, so the order of the
    # sections below matters.
    # NOTE(review): the three markers are assumed to be present on every
    # detail page; there is no guard for a missing one.
    source = searchForPlus(source, "class=\"descThumb\"")
    source = searchForPlus(source, "<div style=\"text-justify")
    source = searchForPlus(source, "\">")
    i = string.find(source, "<")
    title = stripText(source[0:i])


    # Find Author(s)
    authorList = []

    i = string.find(source, ">By:<")

    if i != -1:
        source = searchForPlus(source, ">By:<")
        i = string.find(source, "</span>")
        authors = stripText(source[0:i])

        # One author per link; the link text is the author's name.
        while searchFor(authors, "href=") is not None:
            authors = searchForPlus(authors, "href=")
            authors = searchForPlus(authors, ">")
            i = string.find(authors, "<")
            author = stripText(authors[0:i])
            i = string.rfind(author, " ")

            # "First Last" -> "Last, First" (split on the last space)
            if i != -1:
                author = stripText(author[i:]) + ", " + stripText(author[0:i])

            authorList.append(author)


    # Find Narrator
    i = string.find(source, "Narrator:")

    if i != -1:
        narrator = searchForPlus(source, "Narrator:")
        narrator = searchForPlus(narrator, "href=")
        narrator = searchForPlus(narrator, ">")
        i = string.find(narrator, "<")
        narrator = stripText(narrator[0:i])
        i = string.rfind(narrator, " ")

        if i != -1:
            narrator = stripText(narrator[i:]) + ", " + stripText(narrator[0:i])

        # The narrator is stored as one more author entry, tagged as such.
        narrator = narrator + " (Narrator)"
        authorList.append(narrator)


    # Spread the collected names over the six author fields
    if len(authorList) > 0:
        author = authorList[0]

    if len(authorList) > 1:
        author2 = authorList[1]

    if len(authorList) > 2:
        author3 = authorList[2]

    if len(authorList) > 3:
        author4 = authorList[3]

    if len(authorList) > 4:
        author5 = authorList[4]

    if len(authorList) > 5:
        author6 = authorList[5]


    # Find Price
    i = string.find(source, "r Price:")

    if i == -1:
        i = string.find(source, "Price:<")

    if i != -1:
        source = source[i:]
        priced = searchFor(source, "$")

        # Only narrow source and set value when an amount is actually there.
        if priced is not None:
            source = priced
            i = string.find(source, "<")
            value = stripText(source[0:i])


    # Find Rating
    i = string.find(source, "class=\"newrating\"")

    if i != -1:
        rating = searchForPlus(source, "class=\"newrating\"")
        rating = searchForPlus(rating, ">")
        rating = searchForPlus(rating, ">")
        i = string.find(rating, "<br>")
        rating = stripText(rating[0:i])
        i = string.find(rating, "\"Av. Customer Rating:")

        if i != -1:
            # The rating is the text between the label and the closing quote.
            rating = searchForPlus(rating, "\"Av. Customer Rating:")
            i = string.find(rating, "\"")
            rating = stripText(rating[0:i])
        else:
            rating = ""


    # Find Format
    i = string.find(source, ">Program Type:<")

    if i != -1:
        source = searchForPlus(source, ">Program Type:<")
        source = searchForPlus(source, ">")
        i = string.find(source, "<")
        format = stripText(source[0:i])
        i = string.find(format, "\n")

        # Keep only the first line of the program type
        if i != -1:
            format = stripText(format[0:i])


    # Find Publisher
    i = string.find(source, ">Publisher:<")

    if i != -1:
        publisher = searchForPlus(source, ">Publisher:<")
        publisher = searchForPlus(publisher, "'>")
        i = string.find(publisher, "<br />")
        publisher = stripText(publisher[0:i])

        i = string.find(publisher, ", ")

        # Text after the first ", " is taken as the date
        if i != -1:
            date = searchForPlus(publisher, ", ")
            publisher = stripText(publisher[0:i])

        i = string.find(publisher, "href")

        # Publisher is wrapped in a link: keep only the link text
        if i != -1:
            publisher = searchForPlus(publisher, ">")
            i = string.find(publisher, "</a>")
            publisher = stripText(publisher[0:i])


    # Find Dimensions (Length)
    i = string.find(source, ">Length:<")

    if i != -1:
        dimensions = searchForPlus(source, ">Length:<")
        dimensions = searchForPlus(dimensions, ">")
        i = string.find(dimensions, "<")
        dimensions = stripText(dimensions[0:i])


    # Find Comments
    comments = ""
    i = string.find(source, "<div class=\"publisherHeader\">")

    if i != -1:
        source = searchFor(source, "<div class=\"publisherHeader\">")
        i = string.find(source, "</td>")
        comments = stripText(source[0:i])

        # Opening tags that carry attributes: <div ...> becomes a paragraph
        # break, <p ...> and <a ...> are dropped.
        comments = _stripOpenTags(comments, "<div", "<BR><BR>")
        comments = _stripOpenTags(comments, "<p ", "")
        comments = _stripOpenTags(comments, "<a ", "")

        # Drop the source line breaks; layout is rebuilt from the tags below
        while searchFor(comments, "\n") is not None:
            i = string.find(comments, "\n")
            comments = comments[0:i] + comments[i+1:]

        comments = string.replace(comments, "<p>\n", "\n\n")
        comments = string.replace(comments, "<P>", "\n\n")
        comments = string.replace(comments, "</P>", "")
        comments = string.replace(comments, "<p>", "\n\n")
        comments = string.replace(comments, "</p>", "")
        comments = string.replace(comments, "<BR>", "\n")
        comments = string.replace(comments, "<br>", "\n")
        comments = string.replace(comments, "<br />", "\n")
        comments = string.replace(comments, "<I>", "")
        comments = string.replace(comments, "</I>", "")
        comments = string.replace(comments, "<i>", "")
        comments = string.replace(comments, "</i>", "")
        comments = string.replace(comments, "<li>", "*")
        comments = string.replace(comments, "<h3>", "")
        comments = string.replace(comments, "</h3>", "")
        comments = string.replace(comments, "<b>", "")
        comments = string.replace(comments, "</b>", "")
        comments = string.replace(comments, "</div>", "\n")
        comments = string.replace(comments, "</font>", "")
        comments = string.replace(comments, "</a>", "")
        comments = string.replace(comments, "\t", " ")

        # Repeated passes collapse runs of blanks and blank lines
        comments = string.replace(comments, "\n ", "\n")
        comments = string.replace(comments, "\n ", "\n")
        comments = string.replace(comments, "\n ", "\n")
        comments = string.replace(comments, "\n\n\n", "\n\n")
        comments = string.replace(comments, "\n\n\n", "\n\n")
        comments = string.replace(comments, "\n\n\n", "\n\n")
        comments = string.replace(comments, "\n\n\n", "\n\n")
        comments = string.replace(comments, "  ", " ")
        comments = string.replace(comments, "  ", " ")
        comments = string.replace(comments, "  ", " ")
        comments = string.replace(comments, "  ", " ")
        comments = string.replace(comments, "  ", " ")
        comments = string.replace(comments, "  ", " ")
        comments = string.replace(comments, "  ", " ")

        # Remove any blank that still follows a line break
        while searchFor(comments, "\n ") is not None:
            i = string.find(comments, "\n ")
            comments = comments[0:i+1] + comments[i+2:]

        while searchFor(comments, "\r ") is not None:
            i = string.find(comments, "\r ")
            comments = comments[0:i+1] + comments[i+2:]





# Run the extraction, then always run the optional user exit script,
# even when extract() raised.
try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        # execfile() returns None, so a trailing "in globals()" would only
        # be a discarded membership test. Pass the namespace explicitly so
        # the user exit runs in this module's globals.
        execfile("scrapers/userexit.py", globals())
