# Amazon past purchases video list scraper (keeps DVD, VHS Tape, Blu-ray and HD DVD items)
#
# Copyright  1999-2007 Readerware Corporation.  All Rights Reserved.

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    jarray import zeros, array
from    java.lang import String


def extract():
    global itemNames,itemLinks
    global source
    global names,links

    names = []
    links = []

    i = string.find(source, ">See more:<")

    if i == -1:
        return

    # Create trace files
    try:
        t2 = open("trace2.html", "w")
        t2.write("Orders by year\n\n")
        t2.close()
    except:
        print "Unable to create trace2.html"

    try:
        t3 = open("trace3.html", "w")
        t3.write("Individiual orders\n\n")
        t3.close()
    except:
        print "Unable to create trace3.html"


    url = searchForPlus(source, "<form action=\"")
    i = string.find(url, "\"")
    url = "http://www.amazon.com" + stripText(url[0:i]) + "/?POST=?opt=ab&groupID=0&orderFilter="

    years = searchForPlus(source, ">See more:<")
    i = string.find(years, "</select>")
    years = stripText(years[0:i])

    # Loop through years to get orders per year
    while (searchFor(years, "value=\"year-") != None):
        years = searchFor(years, "value=\"year-")
        years = searchForPlus(years, "\"")
        
        i = string.find(years, "\"")
        orderFilter = stripText(years[0:i])

        http = HTTPConnection()
        http.resetReferer();
        http.blockForLoad();
        year = http.getContents(url + orderFilter)

        try:
            t2 = open("trace2.html", "a")
            t2.write(year)
            t2.close()
        except:
            print "Unable to write to trace2.html"

        # Loop through this year to get individual orders
        orderCount = 0

        while (searchFor(year, "alt=\"View order\"") != None):
            i = string.find(year, "alt=\"View order\"")
            orders = stripText(year[i-250:])
            orders = searchForPlus(orders, "href=\"")

            year = searchForPlus(year, "alt=\"View order\"")

            i = string.find(orders, "\"")
            orders = stripText(orders[0:i])

            if orders.startswith("http") == 1:
                # z-shop, can't get those
                continue

            orders = "http://www.amazon.com" + orders

            http = HTTPConnection()
            http.resetReferer();
            http.blockForLoad();
            order = http.getContents(orders)
            errorCount = 0

            try:
                t3 = open("trace3.html", "a")
                t3.write(order)
                t3.close()
            except:
                print "Unable to write to trace3.html"

            while (searchFor(order, "<title>We're sorry!</title>") != None):
                # Amazon sometimes has an internal error when fetching the order,
                # try a few times it should clear up.
                print "Amazon internal error encountered"
                errorCount = errorCount + 1

                if errorCount > 5:
                    break

                http.blockForLoad();
                order = http.getContents(orders)

                try:
                    t3 = open("trace3.html", "a")
                    t3.write("Amazon internal error encountered")
                    t3.write(order)
                    t3.close()
                except:
                    print "Unable to write to trace3.html"

            while (searchFor(order, "Order Placed:") == None):
                # Amazon sometimes displays an empty order list
                # try a few times it should clear up.
                print "Amazon empty order list encountered"
                errorCount = errorCount + 1

                if errorCount > 5:
                    break

                http.blockForLoad();
                order = http.getContents(orders)

                try:
                    t3 = open("trace3.html", "a")
                    t3.write("Amazon empty order encountered")
                    t3.write(order)
                    t3.close()
                except:
                    print "Unable to write to trace3.html"

            if errorCount > 5:
                continue;

            extractOrder(order)
            orderCount = orderCount + 1

            if string.find(year, "alt=\"View order\"") == -1:
                print "No more orders on page, OC=", orderCount
                # Last order on this page, check for more pages
                while (searchFor(year, "&startAtIndex=") != None):
                    startAt = searchForPlus(year, "&startAtIndex=")
                    i = string.find(startAt, "&")
                    startAt = stripText(startAt[0:i])
                    print "startAt=",startAt

                    if int(startAt) == orderCount:
                        i = string.find(year, "&startAtIndex=")
                        next = stripText(year[i-70:])
                        next = searchForPlus(next, "href=\"")
                        i = string.find(next, "\"")
                        next = stripText(next[0:i])

                        http = HTTPConnection()
                        http.resetReferer();
                        http.blockForLoad();
                        year = http.getContents("http://www.amazon.com" + next)

                        try:
                            t2 = open("trace2.html", "a")
                            t2.write(year)
                            t2.close()
                        except:
                            print "Unable to write to trace2.html"

                        break

                    year = searchForPlus(year, "&startAtIndex=")

    itemNames = array(names, String)
    itemLinks = array(links, String)



def extractOrder(order):
    global names,links

    order = searchForPlus(order, "order number:")
    i = string.find(order, ">Need Help?<")
    order = stripText(order[0:i])

    # Find item names/links
    while (searchFor(order, "/gp/product/") != None):
        i = string.find(order, "/gp/product/")
        order = stripText(order[i-50:])

        link = searchForPlus(order, "href=\"")
        i = string.find(link, "\"")
        link = stripText(link[0:i])

        name = searchForPlus(order, "href=\"")
        name = searchForPlus(name, ">")
        i = string.find(name, "<")
        name = stripText(name[0:i])
        print "FOUND: ", name

        format = searchForPlus(order, "href=\"")
        format = searchForPlus(format, ">")
        format = searchForPlus(format, "<")

        i = string.find(format, "[")

        if i != -1 and i < 50:
            format = searchForPlus(format, "[")
            i = string.find(format, "]")
            format = stripText(format[0:i])
            print "FOUND: ", name,"-",format

            if format == "DVD" or format == "VHS Tape" or format == "Blu-ray" or format == "HD DVD":
                print "SELECTED: ", name,"-",format
                links.append(link)
                names.append(name)

        order = searchForPlus(order, "/gp/product/")
        




# Script entry point: run the scraper when the script is executed.
# Any exception raised by extract() propagates to the caller unchanged.
extract()
