# British Library (Online Public Access Catalogue, OPAC) scraper
#
# Copyright  1999-2007 Readerware Corporation.  All Rights Reserved.

import  os
import  string

from    com.readerware.http import HTTPConnection

from    scrapers.scrapers import calcISBN10CheckDigit
from    scrapers.scrapers import searchFor
from    scrapers.scrapers import searchForPlus
from    scrapers.scrapers import stripHTML
from    scrapers.scrapers import stripText


def _writeTrace(path, text):
    """Dump *text* to *path* for debugging, on a best-effort basis.

    The dump is purely diagnostic, so a failure to create or write the
    file (for example a read-only working directory) must not abort the
    catalogue lookup.  The handle is always closed once it was opened.
    """
    try:
        trace = open(path, "w")
        try:
            trace.write(text)
        finally:
            trace.close()
    except IOError:
        # Deliberately ignored: the trace is optional.
        pass


def _collectAuthors(authors, authorList):
    """Append every name found in the HTML fragment *authors* to *authorList*.

    Each name follows a 'TAG");'> link marker and runs up to the end of
    its table cell.  Highlighting, trailing markup, anything after a
    ",&nbsp;" separator and a trailing full stop are stripped before the
    name is stored.
    """
    while searchFor(authors, "TAG\");'>") is not None:
        authors = searchForPlus(authors, "TAG\");'>")
        i = string.find(authors, "</td>")
        name = stripText(authors[0:i])
        name = cleanupData(name)

        # Drop any markup left behind after the name itself.
        i = string.find(name, "<")
        if i != -1:
            name = stripText(name[0:i])

        # Keep only the part before the first ",&nbsp;" separator.
        i = string.find(name, ",&nbsp;")
        if i != -1:
            name = stripText(name[0:i])

        if name.endswith("."):
            name = stripText(name[0:len(name)-1])

        authorList.append(name)


def extract():
    """Parse the British Library page in the global ``source``.

    Results are stored in the module-level globals declared below
    (title, author, isbn, publisher, ...).  When ``source`` is a search
    results page, the first hit's detail page is fetched over HTTP and
    parsed instead; if the results page lists no hit the function
    returns with only the defaults set.

    The page is consumed front to back: most sections advance
    ``source`` past the label they handled, so the order of the
    sections below matters.
    """
    global title,author,isbn,publisher,format,first,signed,date,place
    global copies,rating,condition,category,read,pflag,eflag,value
    global comments,dateEntered,dataSource,cart,ordered
    global lccn,dewey,userNumber,copyDate,valueDate,location
    global series,pages,keywords,dimensions
    global user1,user2,user3,user4,user5,user6,user7,user8,user9,user10
    global author2,author3,author4,author5,author6
    global usedprice,usedcount,collectibleprice,collectiblecount
    global newprice,newcount,listprice,readinglevel,salesrank,available
    global buyerwaiting,editionNumber,weight,image
    global fullDateFormat,source


    # Defaults
    first               = "N"
    signed              = "N"
    read                = "N"
    pflag               = "Y"
    eflag               = "Y"
    ordered             = "N"
    usedprice           = ""
    usedcount           = ""
    collectibleprice    = ""
    collectiblecount    = ""
    newprice            = ""
    newcount            = ""
    listprice           = ""
    readinglevel        = ""
    salesrank           = ""
    available           = "Y"
    buyerwaiting        = "N"
    weight              = ""

    # No default place is assigned here; place is only filled in from
    # the Publisher/year field below when that field carries one.


    # Main extraction
    #
    # British Library highlights search words in the main page, need
    # to remove these highlights as they can appear anywhere, in the
    # middle of a field. This can really mess up the extraction.
    source = string.replace(source, "<span class=text3 id=normalb>", "")
    source = string.replace(source, "</span>", "")

    # Get detail page from search results page
    i = string.find(source, ">Integrated Catalogue - Search Results<")

    if i != -1:
        # Find Match, if any
        i = string.find(source, ">1</A>")

        if i == -1:
            return

        # Back up far enough to include the link that wraps the hit
        # number.  Clamp at 0: a negative start would slice from the
        # END of the page and miss the link entirely.
        source = stripText(source[max(0, i-200):])
        source = searchForPlus(source, "<A HREF=")
        i = string.find(source, ">")
        url = stripText(source[0:i])
        http = HTTPConnection()
        http.resetReferer()
        http.blockForLoad()
        source = http.getContents(url)
        _writeTrace("trace2.html", source)


    # Find Author
    authorList = []
    i = string.find(source, "Author - personal &nbsp;")

    if i != -1:
        # The personal author block ends where the title label starts.
        source = searchForPlus(source, "Author - personal &nbsp;")
        i = string.find(source, " Title &nbsp;")
        _collectAuthors(stripText(source[0:i]), authorList)

    i = string.find(source, "Added name &nbsp;")

    if i != -1:
        # Added names are read from a copy; source itself is not
        # advanced because the title still has to be found in it.
        authors = searchForPlus(source, "Added name &nbsp;")
        i = string.find(authors, "Holdings (All) &nbsp;")
        _collectAuthors(stripText(authors[0:i]), authorList)

    author = ""

    if len(authorList) > 0:
        author = authorList[0]

    if len(authorList) > 1:
        author2 = authorList[1]

    if len(authorList) > 2:
        author3 = authorList[2]

    if len(authorList) > 3:
        author4 = authorList[3]

    if len(authorList) > 4:
        author5 = authorList[4]

    if len(authorList) > 5:
        author6 = authorList[5]


    # Find Title
    source = searchForPlus(source, " Title &nbsp;")
    source = searchForPlus(source, "<td")
    source = searchForPlus(source, "TAG\");'>")
    i = string.find(source, "</td")
    title = stripHTML(source[0:i])
    title = cleanupData(title)

    if title.endswith("/"):
        i = string.rfind(title, "/")
        title = stripText(title[0:i])

    # Anything after " / " is the statement of responsibility.
    i = string.find(title, " / ")

    if i != -1:
        title = stripText(title[0:i])

    if title != "":
        if author == "":
            author = "No Author"


    # Find Publisher and date
    i = string.find(source, "Publisher/year &nbsp;")

    if i != -1:
        publisher = ""
        place = ""
        date = ""
        copyDate = ""
        source = searchForPlus(source, "Publisher/year &nbsp;")
        source = searchForPlus(source, "<td")
        source = searchForPlus(source, "TAG\");'>")
        i = string.find(source, "</td")
        publisher = stripText(source[0:i])
        publisher = cleanupData(publisher)

        # Field layout: "place:&nbsp;publisher,&nbsp;date".  Only split
        # off the date when the separator is really there; slicing with
        # i == -1 would chop the publisher and invent a bogus date.
        i = string.find(publisher, ",&nbsp;")

        if i != -1:
            date = stripText(publisher[i+7:])
            publisher = stripText(publisher[0:i])

        i = string.find(publisher, ":&nbsp;")

        if i != -1:
            place = stripText(publisher[0:i])
            publisher = stripText(publisher[i+7:])

        # Keep only the first of several ";&nbsp;" separated entries.
        i = string.find(publisher, ";&nbsp;")

        if i != -1:
            publisher = stripText(publisher[0:i])

        i = string.find(place, ";&nbsp;")

        if i != -1:
            place = stripText(place[0:i])

        if date.endswith("."):
            i = string.rfind(date, ".")
            date = stripText(date[0:i])

        # "1999, c1998" -> date 1999, copyright date 1998
        i = string.find(date, ", ")

        if i != -1:
            copyDate = stripText(date[i+1:])
            date = stripText(date[0:i])

            if copyDate.startswith("c"):
                copyDate = stripText(copyDate[1:])

        i = string.find(date, "(")

        if i != -1:
            date = stripText(date[0:i])

        # A lone "c1998" serves as both publication and copyright date.
        if date.startswith("c") and copyDate == "":
            date = stripText(date[1:])
            copyDate = date


    # Find Pages, Dimensions and format
    i = string.find(source, "Physical descr. &nbsp;")

    if i != -1:
        source = searchForPlus(source, "Physical descr. &nbsp;")
        source = searchForPlus(source, "<td")
        source = searchForPlus(source, ">")
        i = string.find(source, "</td")
        pages = stripText(source[0:i])
        pages = cleanupData(pages)

        i = string.rfind(pages, "pbk")

        if i != -1:
            format = "Paperback"
        else:
            format = "Hardcover"

        i = string.find(pages, "cm")

        if i != -1:
            # Dimensions follow the page count ("p") and end at "cm".
            i = string.find(pages, "p")

            if i != -1:
                dimensions = stripText(pages[i+1:])
            else:
                dimensions = pages

            i = string.find(dimensions, "cm")
            dimensions = stripText(dimensions[0:i+2])

            i = string.find(dimensions, " ; ")

            if i != -1:
                dimensions = stripText(dimensions[i+1:])

            if dimensions.startswith("."):
                dimensions = stripText(dimensions[1:])

            if dimensions.startswith(";"):
                dimensions = stripText(dimensions[1:])

        # The page count is whatever precedes the first "p".
        i = string.find(pages, "p")

        if i != -1:
            pages = stripText(pages[0:i])
        else:
            pages = ""

        i = string.rfind(pages, ",")

        if i != -1:
            pages = stripText(pages[i+1:])


    # Find Comments
    #
    # Both note fields are read from copies so that source is not
    # advanced past the fields that still follow.
    comments = ""
    i = string.find(source, "General note &nbsp;")

    if i != -1:
        comment = searchForPlus(source, "General note &nbsp;")
        comment = searchForPlus(comment, "<td")
        comment = searchForPlus(comment, ">")
        i = string.find(comment, "</td")
        comment = stripHTML(comment[0:i])
        comment = cleanupData(comment)
        comments = comment

    i = string.find(source, "Contents &nbsp;")

    if i != -1:
        comment = searchForPlus(source, "Contents &nbsp;")
        comment = searchForPlus(comment, "<td")
        comment = searchForPlus(comment, ">")
        i = string.find(comment, "</td")
        comment = stripHTML(comment[0:i])
        comment = cleanupData(comment)

        if comments == "":
            comments = comment
        else:
            comments = comments + "\n\n" + comment


    # Find Series
    i = string.find(source, "Series &nbsp;")

    if i != -1:
        source = searchForPlus(source, "Series &nbsp;")
        source = searchForPlus(source, "<td")
        source = searchForPlus(source, ">")
        i = string.find(source, "</td")
        series = stripText(source[0:i])
        i = string.find(series, "TAG\");'>")

        if i != -1:
            series = searchForPlus(series, "TAG\");'>")
            i = string.find(series, "<")
            series = stripText(series[0:i])

        series = cleanupData(series)


    # Find Category
    i = string.find(source, "Subject &nbsp;")

    if i != -1:
        source = searchForPlus(source, "Subject &nbsp;")
        source = searchForPlus(source, "<td")
        source = searchForPlus(source, "TAG\");'>")
        i = string.find(source, "</td")
        category = stripHTML(source[0:i])
        category = cleanupData(category)


    # Find ISBN
    i = string.find(source, "ISBN &nbsp;")

    if i != -1:
        source = searchForPlus(source, "ISBN &nbsp;")
        i = string.find(source, "<!-- filename: full-set-tail -->")
        isbn = stripText(source[0:i])

        # Do not read past the ISBN rows into the Dewey field.
        i = string.find(isbn, "Dewey class. no. &nbsp;")

        if i != -1:
            isbn = stripText(isbn[0:i])

        i = string.find(isbn, "<span class=match>")

        if i != -1:
            # BL can list multiple ISBNs. If one is highlighted, then
            # that is the one we searched for, take it.
            isbn = searchForPlus(isbn, "<span class=match>")
            i = string.find(isbn, "</td>")
            isbn = stripText(isbn[0:i])
            isbn = cleanupData(isbn)
        else:
            # No highlighted ISBN, take first.
            isbn = searchForPlus(isbn, "<td")
            isbn = searchForPlus(isbn, "TAG\");'>")
            i = string.find(isbn, "</td")
            isbn = stripText(isbn[0:i])
            isbn = cleanupData(isbn)

        # A "pbk" qualifier next to the ISBN overrides the format.
        i = string.find(isbn, "pbk")

        if i != -1:
            format = "Paperback"

        i = string.find(isbn, " ")

        if i != -1:
            isbn = stripText(isbn[0:i])

        i = string.find(isbn, "<")

        if i != -1:
            isbn = stripText(isbn[0:i])

        if isbn.startswith("978"):
            # Convert ISBN-13 to ISBN-10: drop the 978 prefix and the
            # old check digit, then let the helper add the new one.
            isbn = isbn[3:]
            isbn = stripText(isbn[0:len(isbn)-1])
            isbn = calcISBN10CheckDigit(isbn)


    # Find Dewey
    i = string.find(source, "Dewey class. no. &nbsp;")

    if i != -1:
        source = searchForPlus(source, "Dewey class. no. &nbsp;")
        source = searchForPlus(source, "<td")
        source = searchForPlus(source, "TAG\");'>")
        i = string.find(source, "<")
        dewey = stripText(source[0:i])
        dewey = cleanupData(dewey)


def cleanupData(str):
    """Remove search-term highlighting and tidy characters in one field.

    British Library highlights search terms inside the field text, so
    this is called after a field has been cut out of the page.

    It removes every "<span ...>" opening tag together with "</span>"
    and "</A>", turns "&rsquo;" into an apostrophe, and reduces a fixed
    set of UTF-8 encoded accented letters to their plain base letter.

    str -- the raw field text (the name shadows the builtin but is kept
           so existing callers are unaffected).

    Returns the cleaned text.
    """
    text = str

    # Strip opening span tags.  The same search drives both the loop
    # condition and the cut position, so the two can never disagree.
    start = text.find("<span")

    while start != -1:
        end = text.find(">", start)

        if end == -1:
            # Unterminated tag (field was cut mid-tag): drop the
            # fragment.  Leaving it in place would loop forever.
            text = text[0:start]
            break

        text = text[0:start] + text[end+1:]
        start = text.find("<span")

    text = text.replace("</span>", "")
    text = text.replace("</A>", "")
    text = text.replace("&rsquo;", "'")

    # Precomposed letters: e with macron, o with macron.
    text = text.replace("\xC4\x93", "e")
    text = text.replace("\xC5\x8D", "o")

    # Base letter followed by a UTF-8 combining mark.  Only the mark is
    # removed; replacing the pair with nothing would also delete the
    # letter itself (e.g. turn "Se<acute>bastien" into "Sbastien").
    # NOTE(review): plain ASCII is used because the character encoding
    # expected downstream is not visible from this file -- confirm.
    combining = (
        ("cC",         "\xCC\xA7"),     # cedilla
        ("aAeEiIoOuU", "\xCC\x80"),     # grave accent
        ("aAeEiIoOuU", "\xCC\x81"),     # acute accent
        ("aA",         "\xCC\x83"),     # tilde
        ("uU",         "\xCC\x88"),     # diaeresis
    )

    for letters, mark in combining:
        for letter in letters:
            text = text.replace(letter + mark, letter)

    return text



# Run the extraction, then always run the optional user exit script,
# even if extract() raised, before any exception propagates.
try:
    extract()
finally:
    if os.path.exists("scrapers/userexit.py"):
        # The namespace is an argument of execfile(); writing
        # "execfile(...) in globals()" only performs a discarded
        # membership test on the call's return value.  Passing
        # globals() explicitly runs the user exit in this module's
        # namespace regardless of where this statement lives.
        execfile("scrapers/userexit.py", globals())
