# Internet Speculative Fiction Database scraper
#
#

import  os

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import convertAuthor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripNewLines

# Retrieve the value that follows a bold "<b>label</b>" field in the page HTML
def field_value(data, tag, keep_tags=None):
    """Return the text that follows the bold label ``<b>tag</b>`` in *data*.

    The value starts right after the label and runs up to the next
    ``<li>``, ``</ul>`` or ``<br>``.  Unless *keep_tags* is true, every
    ``<...>`` element inside the value is removed.  The result is passed
    through ``stripNewLines`` before being returned.

    An empty string is returned when the label is absent, when none of
    the terminators follows it, or when tags are being removed and a
    ``<`` has no closing ``>``.
    """
    label = "<b>" + tag + "</b>"
    label_pos = data.find(label)
    if label_pos < 0:
        return ""
    start = label_pos + len(label)

    # Hop from one '<' to the next until one of the terminators is found.
    end = start
    while True:
        if data.startswith("<li>", end):
            break
        if data.startswith("</ul>", end):
            break
        if data.startswith("<br>", end):
            break
        end = data.find('<', end + 1)
        if end < 0:
            return ""

    raw = data[start:end]
    if keep_tags:
        return stripNewLines(raw)

    # Collect the text found between tags in a single left-to-right pass.
    pieces = []
    pos = 0
    while True:
        lt = raw.find('<', pos)
        if lt < 0:
            pieces.append(raw[pos:])
            break
        gt = raw.find('>', lt)
        if gt < 0:
            # Unterminated tag: treat the field as unusable.
            return ""
        pieces.append(raw[pos:lt])
        pos = gt + 1

    return stripNewLines("".join(pieces))

# Maps the binding codes looked up by extract() to the format names
# stored in the record.  Codes not listed here leave the format untouched.
format_convert = {
    "hc": "Hardcover",
    "tp": "Paperback",
    "pb": "Paperback",
}

def _fetch_page(url):
    """Return the contents of *url*, fetched through a new HTTPConnection."""
    http = HTTPConnection()
    http.resetReferer()
    http.blockForLoad()
    return http.getContents(url)


def _write_trace(text):
    """Overwrite the debugging file trace.html with *text*.

    The handle is closed even when the write fails; the error itself is
    still propagated to the caller.
    """
    t = open("trace.html", "w")
    try:
        t.write(text)
    finally:
        t.close()


def extract():
    """Fill the record globals from the ISFDB page held in ``source``.

    Reads the module-level ``source`` (the HTML of either a publication
    page or a "Pub Search" result list) and assigns title, authors, date,
    ISBN, publisher, list price, pages, format, image and series to the
    module-level globals declared below.  Those globals, including
    ``source``, ``format`` and ``image``, are presumably supplied by the
    host application before this script runs -- TODO confirm.

    The function returns silently, leaving whatever has been set so far,
    when the expected markup is not present: no publication link in a
    search list, no MetadataBox/VerificationBox, no "Title Reference:"
    link, or no ``<div id="main">`` on the title page.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = ""
    buyerwaiting        = "N"
    weight              = ""

    # No place extraction default to US
    # place = "United States"

    if source.find('<h2>Pub Search</h2>') != -1:
        try:
            # We're at a multiple item list.  Select the first item.
            # The URL is built in a local so that ``source`` is only
            # replaced once the publication page has been downloaded.
            i = source.index("/pl.cgi?")
            i = source.rindex('href="', 0, i)
            url = source[i + 6:]
            url = url[:url.index('"')]
            source = _fetch_page(url)
            _write_trace(source)
        except ValueError:
            # No usable publication link (or a ValueError raised while
            # fetching/tracing, as in the original code): give up quietly.
            return

    # Find Metadata
    box_start = source.find('<div id="MetadataBox">')
    box_end = source.find('<div id="VerificationBox">')
    if box_start == -1 or box_end == -1:
        return
    metadata = source[box_start:box_end]

    # Get Title
    title = field_value(metadata, "Title:")
    # Strip parenthesized extras.  endswith() is used instead of
    # title[-1] so an empty title does not raise IndexError.
    if title.endswith(')'):
        i = title.rfind('(')
        if i != -1:
            # Set the series from the parenthesized text, override later
            # if something better is found.
            series = title[i + 1:-1]
            title = stripNewLines(title[:i])

    # Get Authors (comma separated; at most six are stored)
    authors = field_value(metadata, "Authors:")
    if authors:
        authorList = [convertAuthor(name.strip())
                      for name in authors.split(",")]

        if len(authorList) > 0:
            author = authorList[0]

        if len(authorList) > 1:
            author2 = authorList[1]

        if len(authorList) > 2:
            author3 = authorList[2]

        if len(authorList) > 3:
            author4 = authorList[3]

        if len(authorList) > 4:
            author5 = authorList[4]

        if len(authorList) > 5:
            author6 = authorList[5]

    # Get date
    date = field_value(metadata, "Year:")

    # Get ISBN
    isbn = field_value(metadata, "ISBN-10:")

    # Get Publisher
    publisher = field_value(metadata, "Publisher:")

    # Get List Price
    listprice = field_value(metadata, "Price:")

    # Get Pages: the field may be several "+"-separated parts; keep the
    # first part that parses as an integer.
    for page in field_value(metadata, "Pages:").split("+"):
        try:
            int(page)
        except ValueError:
            continue
        pages = page.strip()
        break

    # Get Format (unknown binding codes keep the current format)
    binding = field_value(metadata, "Binding:")
    if binding:
        format = format_convert.get(binding, format)

    # Get Image
    try:
        i = metadata.index('alt="picture"')
        i = metadata.rindex("<img", 0, i)
        imageurl = searchForPlus(metadata[i:], 'src="')
        if imageurl:
            image = imageurl[:imageurl.index('"')]
    except ValueError:
        # No cover picture in the metadata: leave ``image`` as it was.
        pass

    # Grab the Title Reference.  field_value() returns "" (never None)
    # when the field is missing, so test for emptiness, and treat a value
    # without a quoted href as "no reference" rather than raising.
    ref = field_value(metadata, "Title Reference:", 1)
    if not ref:
        return
    try:
        i = ref.index('href="')
        ref = ref[i + 6:]
        ref = ref[:ref.index('"')]
    except ValueError:
        return

    source = _fetch_page(ref)
    # NOTE(review): the original traces the URL here, not the downloaded
    # page (unlike the search-list branch above) -- confirm which is wanted.
    _write_trace(ref)

    # Find Data
    i = source.find('<div id="main">')
    if i == -1:
        return
    source = source[i:]

    # Now get series information
    betterseries = field_value(source, "Series:")
    if betterseries:
        series = betterseries

    # And number (for me)
    #user1 = field_value(source, "Series Number:")


# Run the scraper.  Whether or not extract() succeeds, an optional
# user-exit script is then executed in this module's global namespace.
try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        # execfile() takes the namespace as an argument.  The previous
        # "execfile(...) in globals()" only evaluated a membership test
        # on execfile's None result and discarded it.
        execfile("scrapers/userexit.py", globals())
