# Library of Congress scraper
#
# Copyright  1999-2007 Readerware Corporation.  All Rights Reserved.

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripText
from    scrapers.scrapers import stripTelnetCodes


def extract():
    """Fill the module-level book fields from the global ``source`` text.

    Entry point of the scraper.  It reads the module-level ``source`` and
    ``fullDateFormat`` values (they are not assigned anywhere in this file,
    so presumably the host supplies them) and stores its results in the
    module-level names declared ``global`` below.

    Flow:
      1. Set default values for the flag and pricing fields.
      2. If ``source`` contains "view1a.gif", follow the link that comes
         after "view2.gif" and replace ``source`` with the fetched page.
      3. If ``source`` contains "view3a.gif", hand over to extractFull()
         and return.
      4. Otherwise parse the labelled plain-text record ("Author:",
         "Title:", "Published:", ...).

    Optional fields are only assigned when their label is present.
    NOTE(review): searchForPlus() appears to return the text following the
    marker, or None when the marker is absent (see the None check on
    "Subjects:" below) -- confirm against scrapers.scrapers.  The mandatory
    "Title:" and "Published:" lookups are not guarded against that.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source
    global callnumber


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""


    # Main extraction
    # Determine page format
    i = string.find(source, "view1a.gif")

    if (i != -1):
        # Page carries the "view1a.gif" marker: pick up the link that
        # follows "view2.gif" and load that page instead.
        source = searchForPlus(source, "view2.gif")
        source = searchForPlus(source, "<A HREF=\"")
        i = string.find(source, "\"")
        url = "http://catalog.loc.gov" + stripText(source[0:i])
        http = HTTPConnection()
        http.resetReferer()
        http.blockForLoad()
        source = http.getContents(url)

    i = string.find(source, "view3a.gif")

    if (i != -1):
        # Page carries the "view3a.gif" marker: use the HTML table parser.
        extractFull()
        return


    # Find Author (it is possible to have no author line, in
    # which case I will assume "Various", i.e. a collection with no
    # specific author
    author = "Various"

    i = string.find(source, "Author:")

    if (i != -1):
        source = searchForPlus(source, "Author:")
        i = string.find(source, "\n")
        author = stripText(source[0:i])

        # Remove , pseud if present (keep everything before the 2nd comma).
        if (string.count(author, ",") > 1):
            i = string.find(author, ",")
            i = string.find(author, ",", i+1)
            author = author[0:i]

        # Remove trailing period, if present.  When the character two
        # places before the period is a space (a one-letter initial such
        # as "Q."), the period is kept.
        i = string.rfind(author, ".")

        if (i != -1):
            x = i - 2

            if (x >= 2):
                if (author[x:x+1] != ' '):
                    if (len(author) == (i + 1)):
                        author = author[0:i]
            else:
                if (len(author) == (i + 1)):
                    author = author[0:i]


    # Find Title
    source = searchForPlus(source, "Title:     ")
    i = string.find(source, "\n")
    title = stripText(source[0:i])
    source = source[i+1:]

    # Continuation lines start with five spaces.
    while (source[0:5] == "     "):
        i = string.find(source, "\n")

        if (i == -1):
            # Final line has no newline: take what is left and stop.
            # Without this, source never shrinks and the loop never ends.
            title = title + " " + stripText(source)
            source = ""
            break

        title = title + " " + stripText(source[0:i])
        source = source[i+1:]

    # Drop everything from " / " onwards.
    i = string.find(title, " / ")

    if (i != -1):
        title = stripText(title[0:i])

    # Remove Translated by, if present.
    i = string.find(title, ". Translated")

    if (i != -1):
        title = title[0:i]

    # Remove trailing period, if present
    i = string.rfind(title, ".")

    if (i != -1):
        if (len(title) == (i + 1)):
            title = title[0:i]


    # Find Place
    source = searchForPlus(source, "Published:")
    i = string.find(source, "\n")
    place = stripText(source[0:i])
    i = string.find(place, " : ")

    if (i != -1):
        # "place : publisher, date" -- advance source past the separator.
        place = stripText(place[0:i])
        i = string.find(source, " : ")
        source = source[i+2:]
    else:
        i = string.find(place, ", ")

        if (i != -1):
            place = stripText(place[0:i])
            i = string.find(source, ", ")
            source = source[i+1:]


    # Find Publisher
    i = string.find(source, "\n")
    templine = stripText(source[0:i])

    i = string.find(templine, " : ")

    if (i != -1):
        templine = stripText(templine[i+2:])

    # Publisher is the text before the last comma (or before the last
    # "[" when there is no comma); templine keeps the remainder, which
    # is parsed as the date below.
    i = string.rfind(templine, ",")

    if (i != -1):
        publisher = stripText(templine[0:i])
        templine = templine[i+1:]
    else:
        i = string.rfind(templine, "[")

        if (i != -1):
            publisher = stripText(templine[0:i])
            templine = templine[i:]
        else:
            publisher = templine


    # Find Publication Date
    i = string.find(templine, ".")

    if (i != -1):
        date = stripTelnetCodes(templine[0:i])
    else:
        date = templine

    # Strip surrounding square brackets.
    i = string.find(date, "[")

    if (i != -1):
        date = stripText(date[i+1:])

    i = string.find(date, "]")

    if (i != -1):
        date = stripText(date[0:i])

    # if date in format cyyyy, strip c
    # NOTE(review): extractBrief/extractFull also store this value in
    # copyDate; this function does not.
    if (len(date) == 5):
        i = string.find(date, "c")

        if (i == 0):
            date = stripText(date[1:])

    # With fullDateFormat "false" only the last space-separated token
    # of the date is kept.
    if fullDateFormat == "false":
        i = string.rfind(date, " ")

        if i != -1:
            date = stripText(date[i+1:])


    # Find Pages
    i = string.find(source, "Description:")

    if i != -1:
        source = searchForPlus(source, "Description:")
        i = string.find(source, "\n")
        descline = stripText(source[0:i])
        i = string.find(descline, "p.")

        if (i != -1):
            # Pages is the last space-separated token before "p.".
            pages = stripText(descline[0:i])

            i = string.rfind(pages, " ")

            if (i != -1):
                pages = stripText(pages[i:])


        # Find Dimensions (text after the first ";" of the description)
        dimensions = descline
        i = string.find(dimensions, ";")

        if (i != -1):
            dimensions = stripText(dimensions[i+1:])

        if dimensions == "cm.":
            dimensions = ""


    # Find Series
    i = string.find(source, "Series:  ")

    if (i != -1):
        source = searchForPlus(source, "Series:  ")
        i = string.find(source, "\n")
        series = stripText(source[0:i])
        source = source[i+1:]

        # Continuation lines start with five spaces.
        while (source[0:5] == "     "):
            i = string.find(source, "\n")

            if (i == -1):
                # Final line has no newline: take what is left and stop.
                series = series + " " + stripText(source)
                source = ""
                break

            series = series + " " + stripText(source[0:i])
            source = source[i+1:]


    # Find LOC Call Number
    i = string.find(source, "LC Call No.:")

    if (i != -1):
        source = searchForPlus(source, "LC Call No.:")
        i = string.find(source, "\n")
        callnumber = stripText(source[0:i])


    # Find Dewey Number
    i = string.find(source, "Dewey No.:")

    if (i != -1):
        source = searchForPlus(source, "Dewey No.:")
        i = string.find(source, "\n")
        dewey = stripText(source[0:i])


    # Find ISBN
    i = string.find(source, "ISBN:")

    if (i != -1):
        source = searchForPlus(source, "ISBN:")
        i = string.find(source, "\n")
        isbn = stripText(source[0:i])


        # Find format (only look for paperback)
        i = string.find(isbn, "pbk.")

        if (i != -1):
            format = "Paperback"
        else:
            format = "Hardcover"

        # Find Value (and remove from isbn)
        i = string.find(isbn, ":")

        if (i != -1):
            value = stripText(isbn[i+1:])
            value = stripTelnetCodes(value)
            isbn = stripText(isbn[0:i])

            # Drop a trailing parenthesised note from the value.
            i = string.find(value, "(")

            if (i != -1):
                value = stripText(value[0:i])


    # Find Category
    source = searchForPlus(source, "Subjects:")

    if (source is not None):
        i = string.find(source, "\n")
        category = stripText(source[0:i])

        # Keep only the text before the last period.
        i = string.rfind(category, ".")

        if (i != -1):
            category = stripText(category[0:i])


def extractBrief():
    """Parse the HTML page marked by "view1a.gif" into the module fields.

    Reads the module-level ``source`` and ``fullDateFormat`` values and
    assigns title, author, place, publisher, date, dimensions and, when
    the ">CALL NUMBER:<" label is present, callnumber.  copyDate is set
    only when the date has the form "cYYYY".  ``source`` is consumed as
    parsing advances, so the sections must run in this order.

    Nothing in this file calls this function.
    NOTE(review): searchForPlus() appears to return the text after the
    marker and None when the marker is missing -- confirm against
    scrapers.scrapers; most lookups here are not guarded against None.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global image,fullDateFormat,source
    global callnumber


    # Find Title
    source = searchForPlus(source, "view1a.gif")
    source = searchForPlus(source, "<STRONG>")
    i = string.find(source, "<")
    title = stripText(source[0:i])

    # Drop everything from " / " onwards.
    i = string.find(title, " / ")

    if (i != -1):
        title = stripText(title[0:i])

    # Remove Translated by, if present.
    i = string.find(title, ". Translated")

    if (i != -1):
        title = title[0:i]

    # Remove trailing period, if present
    i = string.rfind(title, ".")

    if (i != -1):
        if (len(title) == (i + 1)):
            title = title[0:i]


    # Find Author (it is possible to have no author line, in
    # which case I will assume "Various", i.e. a collection with no
    # specific author
    author = "Various"

    source = searchForPlus(source, ">Brief Description:<")
    i = string.find(source, "SC=Author")

    if (i != -1):
        source = searchForPlus(source, "SC=Author")
        i = string.find(source, "<BR>")
        authorTemp = stripText(source[0:i])
        author = ""

        i = string.rfind(authorTemp, "<STRONG><I>")

        if (i != -1):
            # The name is split over one or more <STRONG><I> fragments;
            # join them with single spaces.
            while (searchFor(authorTemp, "<STRONG><I>") != None):
                authorTemp = searchForPlus(authorTemp, "<STRONG><I>")
                i = string.find(authorTemp, "<")
                author = stripText(author + " " + authorTemp[0:i])
        else:
            # No highlighting: the name is the text between the next
            # ">" and the following "<".
            authorTemp = searchForPlus(authorTemp, ">")
            i = string.find(authorTemp, "<")
            author = stripText(author + " " + authorTemp[0:i])

        # Remove , pseud if present (keep everything before the 2nd comma).
        if (string.count(author, ",") > 1):
            i = string.find(author, ",")
            i = string.find(author, ",", i+1)
            author = author[0:i]

        # Remove trailing period, if present.  When the character two
        # places before the period is a space (a one-letter initial such
        # as "Q."), the period is kept.
        i = string.rfind(author, ".")

        if (i != -1):
            x = i - 2

            if (x >= 2):
                if (author[x:x+1] != ' '):
                    if (len(author) == (i + 1)):
                        author = author[0:i]
            else:
                if (len(author) == (i + 1)):
                    author = author[0:i]

        # Remove trailing comma, if present.  A slice is used so that an
        # empty author string does not raise IndexError.
        if author[-1:] == ",":
            author = author[:-1]


    # Find Place
    source = searchForPlus(source, "<BR>")
    source = searchForPlus(source, "<BR>")

    # If the first 30 characters contain neither separator, skip one
    # more <BR>-delimited line.
    if string.find(source[0:30], " : ") == -1 and string.find(source[0:30], ", ") == -1:
        source = searchForPlus(source, "<BR>")

    i = string.find(source, "<BR>")
    place = stripText(source[0:i])
    i = string.find(place, " : ")

    if (i != -1):
        # "place : publisher, date" -- advance source past the separator.
        place = stripText(place[0:i])
        i = string.find(source, " : ")
        source = source[i+2:]
    else:
        i = string.find(place, ", ")

        if (i != -1):
            place = stripText(place[0:i])
            i = string.find(source, ", ")
            source = source[i+1:]


    # Find Publisher
    # The line is cut at the next newline here; a "<BR>" that survives
    # in the date part is trimmed further down.
    i = string.find(source, "\n")
    templine = stripText(source[0:i])

    i = string.find(templine, " : ")

    if (i != -1):
        templine = stripText(templine[i+2:])

    # Publisher is the text before the last comma (or before the last
    # "[" when there is no comma); templine keeps the remainder, which
    # is parsed as the date below.
    i = string.rfind(templine, ",")

    if (i != -1):
        publisher = stripText(templine[0:i])
        templine = templine[i+1:]
    else:
        i = string.rfind(templine, "[")

        if (i != -1):
            publisher = stripText(templine[0:i])
            templine = templine[i:]
        else:
            publisher = templine


    # Find Publication Date
    i = string.find(templine, ".")

    if (i != -1):
        date = stripText(templine[0:i])
    else:
        date = templine

    # Strip surrounding square brackets.
    i = string.find(date, "[")

    if (i != -1):
        date = stripText(date[i+1:])

    i = string.find(date, "]")

    if (i != -1):
        date = stripText(date[0:i])

    # if date in format cyyyy, strip c (and record it as the copy date)
    if (len(date) == 5):
        i = string.find(date, "c")

        if (i == 0):
            date = stripText(date[1:])
            copyDate = date

    i = string.find(date, "<BR>")

    if (i != -1):
        date = stripText(date[0:i])

    # With fullDateFormat "false" only the last space-separated token
    # of the date is kept.
    if fullDateFormat == "false":
        i = string.rfind(date, " ")

        if i != -1:
            date = stripText(date[i+1:])


    # Find Dimensions (text between the next <BR> and the following tag)
    source = searchForPlus(source, "<BR>")
    i = string.find(source, "<")
    dimensions = stripText(source[0:i])


    # Find LOC Call Number
    i = string.find(source, ">CALL NUMBER:<")

    if (i != -1):
        source = searchForPlus(source, ">CALL NUMBER:<")
        source = searchForPlus(source, "\">")
        i = string.find(source, "<")
        callnumber = stripText(source[0:i])


def extractFull():
    """Parse the HTML table page marked by "view3a.gif" into the module fields.

    Called by extract() when ``source`` contains "view3a.gif".  Reads the
    module-level ``source`` and ``fullDateFormat`` values.  title and
    author are always assigned (author defaults to "Various"); every
    other field is assigned only when its label (">LC Control Number:<",
    ">Published/Created:<", ">Description:<", ">ISBN:<", ...) is found.
    ``source`` is consumed as parsing advances, so the labels are
    looked for in the order written here.

    NOTE(review): searchForPlus() appears to return the text after the
    marker and None when the marker is missing -- confirm against
    scrapers.scrapers; the nested lookups ("<TD>", "HREF=", ...) are not
    guarded against None.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global image,fullDateFormat,source
    global callnumber


    # Find Title
    source = searchForPlus(source, "view3a.gif")
    source = searchForPlus(source, "<STRONG>")
    i = string.find(source, "<")
    title = stripText(source[0:i])

    # Drop everything from " / " onwards.
    i = string.find(title, " / ")

    if (i != -1):
        title = stripText(title[0:i])

    # Remove Translated by, if present.
    i = string.find(title, ". Translated")

    if (i != -1):
        title = title[0:i]

    # Remove trailing period, if present
    i = string.rfind(title, ".")

    if (i != -1):
        if (len(title) == (i + 1)):
            title = title[0:i]


    # Find LCCN
    i = string.find(source, ">LC Control Number:<")

    if (i != -1):
        source = searchForPlus(source, ">LC Control Number:<")
        source = searchForPlus(source, "<TD>")
        i = string.find(source, "<")
        lccn = stripText(source[0:i])




    # Find Author (it is possible to have no author line, in
    # which case I will assume "Various", i.e. a collection with no
    # specific author
    author = "Various"

    i = string.find(source, ">Personal Name:<")

    if (i != -1):
        source = searchForPlus(source, ">Personal Name:<")
        source = searchForPlus(source, "<TD>")
        i = string.find(source, "</TD>")
        authorTemp = stripText(source[0:i])
        author = ""

        i = string.rfind(authorTemp, "<STRONG><I>")

        if (i != -1):
            # The name is split over one or more <STRONG><I> fragments;
            # join them with single spaces.
            while (searchFor(authorTemp, "<STRONG><I>") != None):
                authorTemp = searchForPlus(authorTemp, "<STRONG><I>")
                i = string.find(authorTemp, "<")
                author = stripText(author + " " + authorTemp[0:i])
        else:
            # No highlighting: the name is the text of the link.
            authorTemp = searchForPlus(authorTemp, "HREF=")
            authorTemp = searchForPlus(authorTemp, ">")
            i = string.find(authorTemp, "<")
            author = stripText(author + " " + authorTemp[0:i])

        # Remove , pseud if present (keep everything before the 2nd comma).
        if (string.count(author, ",") > 1):
            i = string.find(author, ",")
            i = string.find(author, ",", i+1)
            author = author[0:i]

        # Remove trailing period, if present.  When the character two
        # places before the period is a space (a one-letter initial such
        # as "Q."), the period is kept.
        i = string.rfind(author, ".")

        if (i != -1):
            x = i - 2

            if (x >= 2):
                if (author[x:x+1] != ' '):
                    if (len(author) == (i + 1)):
                        author = author[0:i]
            else:
                if (len(author) == (i + 1)):
                    author = author[0:i]

        # Remove trailing comma, if present.  A slice is used so that an
        # empty author string does not raise IndexError.
        if author[-1:] == ",":
            author = author[:-1]


    # Find Place
    i = string.find(source, ">Published/Created:<")

    if (i != -1):
        source = searchForPlus(source, ">Published/Created:<")
        source = searchForPlus(source, "<TD>")
        i = string.find(source, "</TD>")
        place = stripText(source[0:i])
        i = string.find(place, " : ")

        if (i != -1):
            # "place : publisher, date" -- advance source past the separator.
            place = stripText(place[0:i])
            i = string.find(source, " : ")
            source = source[i+2:]
        else:
            i = string.find(place, ", ")

            if (i != -1):
                place = stripText(place[0:i])
                i = string.find(source, ", ")
                source = source[i+1:]


        # Find Publisher
        i = string.find(source, "</TD>")
        templine = stripText(source[0:i])

        i = string.find(templine, " : ")

        if (i != -1):
            templine = stripText(templine[i+2:])

        # Publisher is the text before the last comma (or before the
        # last "[" when there is no comma); templine keeps the
        # remainder, which is parsed as the date below.
        i = string.rfind(templine, ",")

        if (i != -1):
            publisher = stripText(templine[0:i])
            templine = templine[i+1:]
        else:
            i = string.rfind(templine, "[")

            if (i != -1):
                publisher = stripText(templine[0:i])
                templine = templine[i:]
            else:
                publisher = templine


        # Find Publication Date
        i = string.find(templine, ".")

        if (i != -1):
            date = stripTelnetCodes(templine[0:i])
        else:
            date = templine

        # Strip surrounding square brackets.
        i = string.find(date, "[")

        if (i != -1):
            date = stripText(date[i+1:])

        i = string.find(date, "]")

        if (i != -1):
            date = stripText(date[0:i])

        # if date in format cyyyy, strip c (and record it as the copy date)
        if (len(date) == 5):
            i = string.find(date, "c")

            if (i == 0):
                date = stripText(date[1:])
                copyDate = date

        # With fullDateFormat "false" only the last space-separated
        # token of the date is kept.
        if fullDateFormat == "false":
            i = string.rfind(date, " ")

            if i != -1:
                date = stripText(date[i+1:])


    # Find Pages
    i = string.find(source, ">Description:<")

    if i != -1:
        source = searchForPlus(source, ">Description:<")
        source = searchForPlus(source, "<TD>")
        i = string.find(source, "</TD>")
        descline = stripText(source[0:i])
        i = string.find(descline, "p.")

        if (i != -1):
            # Pages is the last space-separated token before "p.".
            pages = stripText(descline[0:i])

            i = string.rfind(pages, " ")

            if (i != -1):
                pages = stripText(pages[i:])


        # Find Dimensions (text after the first ";" of the description)
        dimensions = descline
        i = string.find(dimensions, ";")

        if (i != -1):
            dimensions = stripText(dimensions[i+1:])

        if dimensions == "cm.":
            dimensions = ""


    # Find ISBN
    i = string.find(source, ">ISBN:<")

    if (i != -1):
        source = searchForPlus(source, ">ISBN:<")
        source = searchForPlus(source, "<TD>")
        i = string.find(source, "</TD>")
        isbn = stripText(source[0:i])

        # Skip a leading <STRONG><I> tag pair (11 characters long).
        i = string.find(isbn, "<STRONG><I>")

        if (i != -1):
            isbn = isbn[i+11:]

        # Find format (only look for paperback)
        i = string.find(isbn, "pbk.")

        if (i != -1):
            format = "Paperback"
        else:
            format = "Hardcover"

        # Find Value (and remove from isbn)
        i = string.find(isbn, ":")

        if (i != -1):
            value = stripText(isbn[i+1:])
            value = stripTelnetCodes(value)
            isbn = stripText(isbn[0:i])

            # Drop a trailing parenthesised note from the value.
            i = string.find(value, "(")

            if (i != -1):
                value = stripText(value[0:i])

        # Cut the ISBN at any remaining markup.
        i = string.find(isbn, "<")

        if (i != -1):
            isbn = isbn[0:i]


    # Find Category (">Genre/Form:<" is preferred over ">Subjects:<")
    i = string.find(source, ">Genre/Form:<")

    if (i != -1):
        source = searchForPlus(source, ">Genre/Form:<")
        source = searchForPlus(source, "<TD>")
        source = searchForPlus(source, "HREF=")
        source = searchForPlus(source, ">")
        i = string.find(source, "<")
        category = stripText(source[0:i])
    else:
        i = string.find(source, ">Subjects:<")

        if (i != -1):
            source = searchForPlus(source, ">Subjects:<")
            source = searchForPlus(source, "\">")
            i = string.find(source, "<")
            category = stripText(source[0:i])

            # Keep only the text before the last period.
            i = string.rfind(category, ".")

            if (i != -1):
                category = stripText(category[0:i])


    # Find Dewey Number
    i = string.find(source, ">Dewey Class No.:<")

    if (i != -1):
        source = searchForPlus(source, ">Dewey Class No.:<")
        source = searchForPlus(source, "<TD>")
        i = string.find(source, "</TD>")
        dewey = stripText(source[0:i])


    # Find Series
    i = string.find(source, "Series:  ")

    if (i != -1):
        source = searchForPlus(source, "Series:  ")
        i = string.find(source, "\n")
        series = stripText(source[0:i])
        source = source[i+1:]

        # Continuation lines start with five spaces.
        while (source[0:5] == "     "):
            i = string.find(source, "\n")

            if (i == -1):
                # Final line has no newline: take what is left and stop.
                # Without this, source never shrinks and the loop never
                # ends.
                series = series + " " + stripText(source)
                source = ""
                break

            series = series + " " + stripText(source[0:i])
            source = source[i+1:]


    # Find LOC Call Number
    i = string.find(source, ">CALL NUMBER:<")

    if (i != -1):
        source = searchForPlus(source, ">CALL NUMBER:<")
        source = searchForPlus(source, "\">")
        i = string.find(source, "<")
        callnumber = stripText(source[0:i])




# Script entry: run the scraper, then give an optional user exit script a
# chance to run whether or not extract() raised.
try:
    extract()
finally:
    # The user exit is optional; it is only run when the file exists
    # relative to the current working directory.
    if os.path.exists("scrapers/userexit.py"):
        # NOTE(review): execfile() is called with no namespace arguments,
        # so it runs in the current scope.  The trailing "in globals()" is
        # just a membership test on execfile()'s return value whose result
        # is discarded -- it does not pass globals() to execfile().
        execfile("scrapers/userexit.py") in globals()
