jay.scot

       feedparser.py - gamingskill - A Linux gaming news skill for Amazon Alexa, so I could get monthly AWS credits.
 (HTM) git clone git://jay.scot/gamingskill
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
       ---
       feedparser.py (160057B)
       ---
            1 """Universal feed parser
            2 
            3 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
            4 
            5 Visit https://code.google.com/p/feedparser/ for the latest version
            6 Visit http://packages.python.org/feedparser/ for the latest documentation
            7 
            8 Required: Python 2.4 or later
            9 Recommended: iconv_codec <http://cjkpython.i18n.org/>
           10 """
           11 
           12 __version__ = "5.2.1"
           13 __license__ = """
           14 Copyright 2010-2015 Kurt McKee <contactme@kurtmckee.org>
           15 Copyright 2002-2008 Mark Pilgrim
           16 All rights reserved.
           17 
           18 Redistribution and use in source and binary forms, with or without modification,
           19 are permitted provided that the following conditions are met:
           20 
           21 * Redistributions of source code must retain the above copyright notice,
           22   this list of conditions and the following disclaimer.
           23 * Redistributions in binary form must reproduce the above copyright notice,
           24   this list of conditions and the following disclaimer in the documentation
           25   and/or other materials provided with the distribution.
           26 
           27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
           28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
           29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
           30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
           31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
           32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
           33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
           34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
           35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
           36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
           37 POSSIBILITY OF SUCH DAMAGE."""
           38 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
           39 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
           40                     "John Beimler <http://john.beimler.org/>",
           41                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
           42                     "Aaron Swartz <http://aaronsw.com/>",
           43                     "Kevin Marks <http://epeus.blogspot.com/>",
           44                     "Sam Ruby <http://intertwingly.net/>",
           45                     "Ade Oshineye <http://blog.oshineye.com/>",
           46                     "Martin Pool <http://sourcefrog.net/>",
           47                     "Kurt McKee <http://kurtmckee.org/>",
           48                     "Bernd Schlapsi <https://github.com/brot>",]
           49 
           50 # HTTP "User-Agent" header to send to servers when downloading feeds.
           51 # If you are embedding feedparser in a larger application, you should
           52 # change this to your application name and URL.
           53 USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__
           54 
           55 # HTTP "Accept" header to send to servers when downloading feeds.  If you don't
           56 # want to send an Accept header, set this to None.
           57 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
           58 
           59 # List of preferred XML parsers, by SAX driver name.  These will be tried first,
           60 # but if they're not installed, Python will keep searching through its own list
           61 # of pre-installed parsers until it finds one that supports everything we need.
           62 PREFERRED_XML_PARSERS = ["drv_libxml2"]
           63 
           64 # If you want feedparser to automatically resolve all relative URIs, set this
           65 # to 1.
           66 RESOLVE_RELATIVE_URIS = 1
           67 
           68 # If you want feedparser to automatically sanitize all potentially unsafe
           69 # HTML content, set this to 1.
           70 SANITIZE_HTML = 1
           71 
           72 # ---------- Python 3 modules (make it work if possible) ----------
           73 try:
           74     import rfc822
           75 except ImportError:
           76     from email import _parseaddr as rfc822
           77 
           78 try:
           79     # Python 3.1 introduces bytes.maketrans and simultaneously
           80     # deprecates string.maketrans; use bytes.maketrans if possible
           81     _maketrans = bytes.maketrans
           82 except (NameError, AttributeError):
           83     import string
           84     _maketrans = string.maketrans
           85 
           86 # base64 support for Atom feeds that contain embedded binary data
           87 try:
           88     import base64, binascii
           89 except ImportError:
           90     base64 = binascii = None
           91 else:
           92     # Python 3.1 deprecates decodestring in favor of decodebytes
           93     _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
           94 
           95 # _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
           96 # _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
           97 try:
           98     if bytes is str:
           99         # In Python 2.5 and below, bytes doesn't exist (NameError)
          100         # In Python 2.6 and above, bytes and str are the same type
          101         raise NameError
          102 except NameError:
          103     # Python 2
          104     def _s2bytes(s):
          105         return s
          106     def _l2bytes(l):
          107         return ''.join(map(chr, l))
          108 else:
          109     # Python 3
          110     def _s2bytes(s):
          111         return bytes(s, 'utf8')
          112     def _l2bytes(l):
          113         return bytes(l)
          114 
          115 # If you want feedparser to allow all URL schemes, set this to ()
          116 # List culled from Python's urlparse documentation at:
          117 #   http://docs.python.org/library/urlparse.html
          118 # as well as from "URI scheme" at Wikipedia:
          119 #   https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
          120 # Many more will likely need to be added!
          121 ACCEPTABLE_URI_SCHEMES = (
          122     'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
          123     'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
          124     'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
          125     'wais',
          126     # Additional common-but-unofficial schemes
          127     'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
          128     'irc6', 'itms', 'mms', 'msnim', 'skype', 'ssh', 'smb', 'svn', 'ymsg',
          129 )
          130 #ACCEPTABLE_URI_SCHEMES = ()
          131 
          132 # ---------- required modules (should come with any Python distribution) ----------
          133 import cgi
          134 import codecs
          135 import copy
          136 import datetime
          137 import itertools
          138 import re
          139 import struct
          140 import time
          141 import types
          142 import urllib
          143 import urllib2
          144 import urlparse
          145 import warnings
          146 
          147 from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
          148 
          149 try:
          150     from io import BytesIO as _StringIO
          151 except ImportError:
          152     try:
          153         from cStringIO import StringIO as _StringIO
          154     except ImportError:
          155         from StringIO import StringIO as _StringIO
          156 
          157 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
          158 
          159 # gzip is included with most Python distributions, but may not be available if you compiled your own
          160 try:
          161     import gzip
          162 except ImportError:
          163     gzip = None
          164 try:
          165     import zlib
          166 except ImportError:
          167     zlib = None
          168 
          169 # If a real XML parser is available, feedparser will attempt to use it.  feedparser has
          170 # been tested with the built-in SAX parser and libxml2.  On platforms where the
          171 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
          172 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
          173 try:
          174     import xml.sax
          175     from xml.sax.saxutils import escape as _xmlescape
          176 except ImportError:
          177     _XML_AVAILABLE = 0
          178     def _xmlescape(data,entities={}):
          179         data = data.replace('&', '&amp;')
          180         data = data.replace('>', '&gt;')
          181         data = data.replace('<', '&lt;')
          182         for char, entity in entities:
          183             data = data.replace(char, entity)
          184         return data
          185 else:
          186     try:
          187         xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
          188     except xml.sax.SAXReaderNotAvailable:
          189         _XML_AVAILABLE = 0
          190     else:
          191         _XML_AVAILABLE = 1
          192 
          193 # sgmllib is not available by default in Python 3; if the end user doesn't have
          194 # it available then we'll lose illformed XML parsing and content santizing
          195 try:
          196     import sgmllib
          197 except ImportError:
          198     # This is probably Python 3, which doesn't include sgmllib anymore
          199     _SGML_AVAILABLE = 0
          200 
          201     # Mock sgmllib enough to allow subclassing later on
          202     class sgmllib(object):
          203         class SGMLParser(object):
          204             def goahead(self, i):
          205                 pass
          206             def parse_starttag(self, i):
          207                 pass
          208 else:
          209     _SGML_AVAILABLE = 1
          210 
          211     # sgmllib defines a number of module-level regular expressions that are
          212     # insufficient for the XML parsing feedparser needs. Rather than modify
          213     # the variables directly in sgmllib, they're defined here using the same
          214     # names, and the compiled code objects of several sgmllib.SGMLParser
          215     # methods are copied into _BaseHTMLProcessor so that they execute in
          216     # feedparser's scope instead of sgmllib's scope.
          217     charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
          218     tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
          219     attrfind = re.compile(
          220         r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
          221         r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
          222     )
          223 
          224     # Unfortunately, these must be copied over to prevent NameError exceptions
          225     entityref = sgmllib.entityref
          226     incomplete = sgmllib.incomplete
          227     interesting = sgmllib.interesting
          228     shorttag = sgmllib.shorttag
          229     shorttagopen = sgmllib.shorttagopen
          230     starttagopen = sgmllib.starttagopen
          231 
          232     class _EndBracketRegEx:
          233         def __init__(self):
          234             # Overriding the built-in sgmllib.endbracket regex allows the
          235             # parser to find angle brackets embedded in element attributes.
          236             self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
          237         def search(self, target, index=0):
          238             match = self.endbracket.match(target, index)
          239             if match is not None:
          240                 # Returning a new object in the calling thread's context
          241                 # resolves a thread-safety.
          242                 return EndBracketMatch(match)
          243             return None
          244     class EndBracketMatch:
          245         def __init__(self, match):
          246             self.match = match
          247         def start(self, n):
          248             return self.match.end(n)
          249     endbracket = _EndBracketRegEx()
          250 
          251 
          252 # iconv_codec provides support for more character encodings.
          253 # It's available from http://cjkpython.i18n.org/
          254 try:
          255     import iconv_codec
          256 except ImportError:
          257     pass
          258 
          259 # chardet library auto-detects character encodings
          260 # Download from http://chardet.feedparser.org/
          261 try:
          262     import chardet
          263 except ImportError:
          264     chardet = None
          265 
          266 # ---------- don't touch these ----------
          267 class ThingsNobodyCaresAboutButMe(Exception): pass
          268 class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
          269 class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
          270 class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
          271 class UndeclaredNamespace(Exception): pass
          272 
          273 SUPPORTED_VERSIONS = {'': u'unknown',
          274                       'rss090': u'RSS 0.90',
          275                       'rss091n': u'RSS 0.91 (Netscape)',
          276                       'rss091u': u'RSS 0.91 (Userland)',
          277                       'rss092': u'RSS 0.92',
          278                       'rss093': u'RSS 0.93',
          279                       'rss094': u'RSS 0.94',
          280                       'rss20': u'RSS 2.0',
          281                       'rss10': u'RSS 1.0',
          282                       'rss': u'RSS (unknown version)',
          283                       'atom01': u'Atom 0.1',
          284                       'atom02': u'Atom 0.2',
          285                       'atom03': u'Atom 0.3',
          286                       'atom10': u'Atom 1.0',
          287                       'atom': u'Atom (unknown version)',
          288                       'cdf': u'CDF',
          289                       }
          290 
          291 class FeedParserDict(dict):
          292     keymap = {'channel': 'feed',
          293               'items': 'entries',
          294               'guid': 'id',
          295               'date': 'updated',
          296               'date_parsed': 'updated_parsed',
          297               'description': ['summary', 'subtitle'],
          298               'description_detail': ['summary_detail', 'subtitle_detail'],
          299               'url': ['href'],
          300               'modified': 'updated',
          301               'modified_parsed': 'updated_parsed',
          302               'issued': 'published',
          303               'issued_parsed': 'published_parsed',
          304               'copyright': 'rights',
          305               'copyright_detail': 'rights_detail',
          306               'tagline': 'subtitle',
          307               'tagline_detail': 'subtitle_detail'}
          308     def __getitem__(self, key):
          309         '''
          310         :return: A :class:`FeedParserDict`.
          311         '''
          312         if key == 'category':
          313             try:
          314                 return dict.__getitem__(self, 'tags')[0]['term']
          315             except IndexError:
          316                 raise KeyError, "object doesn't have key 'category'"
          317         elif key == 'enclosures':
          318             norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
          319             return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
          320         elif key == 'license':
          321             for link in dict.__getitem__(self, 'links'):
          322                 if link['rel']==u'license' and 'href' in link:
          323                     return link['href']
          324         elif key == 'updated':
          325             # Temporarily help developers out by keeping the old
          326             # broken behavior that was reported in issue 310.
          327             # This fix was proposed in issue 328.
          328             if not dict.__contains__(self, 'updated') and \
          329                 dict.__contains__(self, 'published'):
          330                 warnings.warn("To avoid breaking existing software while "
          331                     "fixing issue 310, a temporary mapping has been created "
          332                     "from `updated` to `published` if `updated` doesn't "
          333                     "exist. This fallback will be removed in a future version "
          334                     "of feedparser.", DeprecationWarning)
          335                 return dict.__getitem__(self, 'published')
          336             return dict.__getitem__(self, 'updated')
          337         elif key == 'updated_parsed':
          338             if not dict.__contains__(self, 'updated_parsed') and \
          339                 dict.__contains__(self, 'published_parsed'):
          340                 warnings.warn("To avoid breaking existing software while "
          341                     "fixing issue 310, a temporary mapping has been created "
          342                     "from `updated_parsed` to `published_parsed` if "
          343                     "`updated_parsed` doesn't exist. This fallback will be "
          344                     "removed in a future version of feedparser.",
          345                     DeprecationWarning)
          346                 return dict.__getitem__(self, 'published_parsed')
          347             return dict.__getitem__(self, 'updated_parsed')
          348         else:
          349             realkey = self.keymap.get(key, key)
          350             if isinstance(realkey, list):
          351                 for k in realkey:
          352                     if dict.__contains__(self, k):
          353                         return dict.__getitem__(self, k)
          354             elif dict.__contains__(self, realkey):
          355                 return dict.__getitem__(self, realkey)
          356         return dict.__getitem__(self, key)
          357 
          358     def __contains__(self, key):
          359         if key in ('updated', 'updated_parsed'):
          360             # Temporarily help developers out by keeping the old
          361             # broken behavior that was reported in issue 310.
          362             # This fix was proposed in issue 328.
          363             return dict.__contains__(self, key)
          364         try:
          365             self.__getitem__(key)
          366         except KeyError:
          367             return False
          368         else:
          369             return True
          370 
          371     has_key = __contains__
          372 
          373     def get(self, key, default=None):
          374         '''
          375         :return: A :class:`FeedParserDict`.
          376         '''
          377         try:
          378             return self.__getitem__(key)
          379         except KeyError:
          380             return default
          381 
          382     def __setitem__(self, key, value):
          383         key = self.keymap.get(key, key)
          384         if isinstance(key, list):
          385             key = key[0]
          386         return dict.__setitem__(self, key, value)
          387 
          388     def setdefault(self, key, value):
          389         if key not in self:
          390             self[key] = value
          391             return value
          392         return self[key]
          393 
          394     def __getattr__(self, key):
          395         # __getattribute__() is called first; this will be called
          396         # only if an attribute was not already found
          397         try:
          398             return self.__getitem__(key)
          399         except KeyError:
          400             raise AttributeError, "object has no attribute '%s'" % key
          401 
          402     def __hash__(self):
          403         return id(self)
          404 
          405 _cp1252 = {
          406     128: unichr(8364), # euro sign
          407     130: unichr(8218), # single low-9 quotation mark
          408     131: unichr( 402), # latin small letter f with hook
          409     132: unichr(8222), # double low-9 quotation mark
          410     133: unichr(8230), # horizontal ellipsis
          411     134: unichr(8224), # dagger
          412     135: unichr(8225), # double dagger
          413     136: unichr( 710), # modifier letter circumflex accent
          414     137: unichr(8240), # per mille sign
          415     138: unichr( 352), # latin capital letter s with caron
          416     139: unichr(8249), # single left-pointing angle quotation mark
          417     140: unichr( 338), # latin capital ligature oe
          418     142: unichr( 381), # latin capital letter z with caron
          419     145: unichr(8216), # left single quotation mark
          420     146: unichr(8217), # right single quotation mark
          421     147: unichr(8220), # left double quotation mark
          422     148: unichr(8221), # right double quotation mark
          423     149: unichr(8226), # bullet
          424     150: unichr(8211), # en dash
          425     151: unichr(8212), # em dash
          426     152: unichr( 732), # small tilde
          427     153: unichr(8482), # trade mark sign
          428     154: unichr( 353), # latin small letter s with caron
          429     155: unichr(8250), # single right-pointing angle quotation mark
          430     156: unichr( 339), # latin small ligature oe
          431     158: unichr( 382), # latin small letter z with caron
          432     159: unichr( 376), # latin capital letter y with diaeresis
          433 }
          434 
          435 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
          436 def _urljoin(base, uri):
          437     uri = _urifixer.sub(r'\1\3', uri)
          438     if not isinstance(uri, unicode):
          439         uri = uri.decode('utf-8', 'ignore')
          440     try:
          441         uri = urlparse.urljoin(base, uri)
          442     except ValueError:
          443         uri = u''
          444     if not isinstance(uri, unicode):
          445         return uri.decode('utf-8', 'ignore')
          446     return uri
          447 
          448 class _FeedParserMixin:
          449     namespaces = {
          450         '': '',
          451         'http://backend.userland.com/rss': '',
          452         'http://blogs.law.harvard.edu/tech/rss': '',
          453         'http://purl.org/rss/1.0/': '',
          454         'http://my.netscape.com/rdf/simple/0.9/': '',
          455         'http://example.com/newformat#': '',
          456         'http://example.com/necho': '',
          457         'http://purl.org/echo/': '',
          458         'uri/of/echo/namespace#': '',
          459         'http://purl.org/pie/': '',
          460         'http://purl.org/atom/ns#': '',
          461         'http://www.w3.org/2005/Atom': '',
          462         'http://purl.org/rss/1.0/modules/rss091#': '',
          463 
          464         'http://webns.net/mvcb/':                                'admin',
          465         'http://purl.org/rss/1.0/modules/aggregation/':          'ag',
          466         'http://purl.org/rss/1.0/modules/annotate/':             'annotate',
          467         'http://media.tangent.org/rss/1.0/':                     'audio',
          468         'http://backend.userland.com/blogChannelModule':         'blogChannel',
          469         'http://web.resource.org/cc/':                           'cc',
          470         'http://backend.userland.com/creativeCommonsRssModule':  'creativeCommons',
          471         'http://purl.org/rss/1.0/modules/company':               'co',
          472         'http://purl.org/rss/1.0/modules/content/':              'content',
          473         'http://my.theinfo.org/changed/1.0/rss/':                'cp',
          474         'http://purl.org/dc/elements/1.1/':                      'dc',
          475         'http://purl.org/dc/terms/':                             'dcterms',
          476         'http://purl.org/rss/1.0/modules/email/':                'email',
          477         'http://purl.org/rss/1.0/modules/event/':                'ev',
          478         'http://rssnamespace.org/feedburner/ext/1.0':            'feedburner',
          479         'http://freshmeat.net/rss/fm/':                          'fm',
          480         'http://xmlns.com/foaf/0.1/':                            'foaf',
          481         'http://www.w3.org/2003/01/geo/wgs84_pos#':              'geo',
          482         'http://www.georss.org/georss':                          'georss',
          483         'http://www.opengis.net/gml':                            'gml',
          484         'http://postneo.com/icbm/':                              'icbm',
          485         'http://purl.org/rss/1.0/modules/image/':                'image',
          486         'http://www.itunes.com/DTDs/PodCast-1.0.dtd':            'itunes',
          487         'http://example.com/DTDs/PodCast-1.0.dtd':               'itunes',
          488         'http://purl.org/rss/1.0/modules/link/':                 'l',
          489         'http://search.yahoo.com/mrss':                          'media',
          490         # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
          491         'http://search.yahoo.com/mrss/':                         'media',
          492         'http://madskills.com/public/xml/rss/module/pingback/':  'pingback',
          493         'http://prismstandard.org/namespaces/1.2/basic/':        'prism',
          494         'http://www.w3.org/1999/02/22-rdf-syntax-ns#':           'rdf',
          495         'http://www.w3.org/2000/01/rdf-schema#':                 'rdfs',
          496         'http://purl.org/rss/1.0/modules/reference/':            'ref',
          497         'http://purl.org/rss/1.0/modules/richequiv/':            'reqv',
          498         'http://purl.org/rss/1.0/modules/search/':               'search',
          499         'http://purl.org/rss/1.0/modules/slash/':                'slash',
          500         'http://schemas.xmlsoap.org/soap/envelope/':             'soap',
          501         'http://purl.org/rss/1.0/modules/servicestatus/':        'ss',
          502         'http://hacks.benhammersley.com/rss/streaming/':         'str',
          503         'http://purl.org/rss/1.0/modules/subscription/':         'sub',
          504         'http://purl.org/rss/1.0/modules/syndication/':          'sy',
          505         'http://schemas.pocketsoap.com/rss/myDescModule/':       'szf',
          506         'http://purl.org/rss/1.0/modules/taxonomy/':             'taxo',
          507         'http://purl.org/rss/1.0/modules/threading/':            'thr',
          508         'http://purl.org/rss/1.0/modules/textinput/':            'ti',
          509         'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
          510         'http://wellformedweb.org/commentAPI/':                  'wfw',
          511         'http://purl.org/rss/1.0/modules/wiki/':                 'wiki',
          512         'http://www.w3.org/1999/xhtml':                          'xhtml',
          513         'http://www.w3.org/1999/xlink':                          'xlink',
          514         'http://www.w3.org/XML/1998/namespace':                  'xml',
          515         'http://podlove.org/simple-chapters':                    'psc',
          516     }
          517     _matchnamespaces = {}
          518 
          519     can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
          520     can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
          521     can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
          522     html_types = [u'text/html', u'application/xhtml+xml']
          523 
          524     def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
          525         if not self._matchnamespaces:
          526             for k, v in self.namespaces.items():
          527                 self._matchnamespaces[k.lower()] = v
          528         self.feeddata = FeedParserDict() # feed-level data
          529         self.encoding = encoding # character encoding
          530         self.entries = [] # list of entry-level data
          531         self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
          532         self.namespacesInUse = {} # dictionary of namespaces defined by the feed
          533 
          534         # the following are used internally to track state;
          535         # this is really out of control and should be refactored
          536         self.infeed = 0
          537         self.inentry = 0
          538         self.incontent = 0
          539         self.intextinput = 0
          540         self.inimage = 0
          541         self.inauthor = 0
          542         self.incontributor = 0
          543         self.inpublisher = 0
          544         self.insource = 0
          545 
          546         # georss
          547         self.ingeometry = 0
          548 
          549         self.sourcedata = FeedParserDict()
          550         self.contentparams = FeedParserDict()
          551         self._summaryKey = None
          552         self.namespacemap = {}
          553         self.elementstack = []
          554         self.basestack = []
          555         self.langstack = []
          556         self.baseuri = baseuri or u''
          557         self.lang = baselang or None
          558         self.svgOK = 0
          559         self.title_depth = -1
          560         self.depth = 0
          561         # psc_chapters_flag prevents multiple psc_chapters from being
          562         # captured in a single entry or item. The transition states are
          563         # None -> True -> False. psc_chapter elements will only be
          564         # captured while it is True.
          565         self.psc_chapters_flag = None
          566         if baselang:
          567             self.feeddata['language'] = baselang.replace('_','-')
          568 
          569         # A map of the following form:
          570         #     {
          571         #         object_that_value_is_set_on: {
          572         #             property_name: depth_of_node_property_was_extracted_from,
          573         #             other_property: depth_of_node_property_was_extracted_from,
          574         #         },
          575         #     }
          576         self.property_depth_map = {}
          577 
          578     def _normalize_attributes(self, kv):
          579         k = kv[0].lower()
          580         v = k in ('rel', 'type') and kv[1].lower() or kv[1]
          581         # the sgml parser doesn't handle entities in attributes, nor
          582         # does it pass the attribute values through as unicode, while
          583         # strict xml parsers do -- account for this difference
          584         if isinstance(self, _LooseFeedParser):
          585             v = v.replace('&amp;', '&')
          586             if not isinstance(v, unicode):
          587                 v = v.decode('utf-8')
          588         return (k, v)
          589 
          590     def unknown_starttag(self, tag, attrs):
          591         # increment depth counter
          592         self.depth += 1
          593 
          594         # normalize attrs
          595         attrs = map(self._normalize_attributes, attrs)
          596 
          597         # track xml:base and xml:lang
          598         attrsD = dict(attrs)
          599         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
          600         if not isinstance(baseuri, unicode):
          601             baseuri = baseuri.decode(self.encoding, 'ignore')
          602         # ensure that self.baseuri is always an absolute URI that
          603         # uses a whitelisted URI scheme (e.g. not `javscript:`)
          604         if self.baseuri:
          605             self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
          606         else:
          607             self.baseuri = _urljoin(self.baseuri, baseuri)
          608         lang = attrsD.get('xml:lang', attrsD.get('lang'))
          609         if lang == '':
          610             # xml:lang could be explicitly set to '', we need to capture that
          611             lang = None
          612         elif lang is None:
          613             # if no xml:lang is specified, use parent lang
          614             lang = self.lang
          615         if lang:
          616             if tag in ('feed', 'rss', 'rdf:RDF'):
          617                 self.feeddata['language'] = lang.replace('_','-')
          618         self.lang = lang
          619         self.basestack.append(self.baseuri)
          620         self.langstack.append(lang)
          621 
          622         # track namespaces
          623         for prefix, uri in attrs:
          624             if prefix.startswith('xmlns:'):
          625                 self.trackNamespace(prefix[6:], uri)
          626             elif prefix == 'xmlns':
          627                 self.trackNamespace(None, uri)
          628 
          629         # track inline content
          630         if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
          631             if tag in ('xhtml:div', 'div'):
          632                 return # typepad does this 10/2007
          633             # element declared itself as escaped markup, but it isn't really
          634             self.contentparams['type'] = u'application/xhtml+xml'
          635         if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
          636             if tag.find(':') <> -1:
          637                 prefix, tag = tag.split(':', 1)
          638                 namespace = self.namespacesInUse.get(prefix, '')
          639                 if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
          640                     attrs.append(('xmlns',namespace))
          641                 if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
          642                     attrs.append(('xmlns',namespace))
          643             if tag == 'svg':
          644                 self.svgOK += 1
          645             return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
          646 
          647         # match namespaces
          648         if tag.find(':') <> -1:
          649             prefix, suffix = tag.split(':', 1)
          650         else:
          651             prefix, suffix = '', tag
          652         prefix = self.namespacemap.get(prefix, prefix)
          653         if prefix:
          654             prefix = prefix + '_'
          655 
          656         # special hack for better tracking of empty textinput/image elements in illformed feeds
          657         if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
          658             self.intextinput = 0
          659         if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
          660             self.inimage = 0
          661 
          662         # call special handler (if defined) or default handler
          663         methodname = '_start_' + prefix + suffix
          664         try:
          665             method = getattr(self, methodname)
          666             return method(attrsD)
          667         except AttributeError:
          668             # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
          669             unknown_tag = prefix + suffix
          670             if len(attrsD) == 0:
          671                 # No attributes so merge it into the encosing dictionary
          672                 return self.push(unknown_tag, 1)
          673             else:
          674                 # Has attributes so create it in its own dictionary
          675                 context = self._getContext()
          676                 context[unknown_tag] = attrsD
          677 
          678     def unknown_endtag(self, tag):
          679         # match namespaces
          680         if tag.find(':') <> -1:
          681             prefix, suffix = tag.split(':', 1)
          682         else:
          683             prefix, suffix = '', tag
          684         prefix = self.namespacemap.get(prefix, prefix)
          685         if prefix:
          686             prefix = prefix + '_'
          687         if suffix == 'svg' and self.svgOK:
          688             self.svgOK -= 1
          689 
          690         # call special handler (if defined) or default handler
          691         methodname = '_end_' + prefix + suffix
          692         try:
          693             if self.svgOK:
          694                 raise AttributeError()
          695             method = getattr(self, methodname)
          696             method()
          697         except AttributeError:
          698             self.pop(prefix + suffix)
          699 
          700         # track inline content
          701         if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
          702             # element declared itself as escaped markup, but it isn't really
          703             if tag in ('xhtml:div', 'div'):
          704                 return # typepad does this 10/2007
          705             self.contentparams['type'] = u'application/xhtml+xml'
          706         if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
          707             tag = tag.split(':')[-1]
          708             self.handle_data('</%s>' % tag, escape=0)
          709 
          710         # track xml:base and xml:lang going out of scope
          711         if self.basestack:
          712             self.basestack.pop()
          713             if self.basestack and self.basestack[-1]:
          714                 self.baseuri = self.basestack[-1]
          715         if self.langstack:
          716             self.langstack.pop()
          717             if self.langstack: # and (self.langstack[-1] is not None):
          718                 self.lang = self.langstack[-1]
          719 
          720         self.depth -= 1
          721 
          722     def handle_charref(self, ref):
          723         # called for each character reference, e.g. for '&#160;', ref will be '160'
          724         if not self.elementstack:
          725             return
          726         ref = ref.lower()
          727         if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
          728             text = '&#%s;' % ref
          729         else:
          730             if ref[0] == 'x':
          731                 c = int(ref[1:], 16)
          732             else:
          733                 c = int(ref)
          734             text = unichr(c).encode('utf-8')
          735         self.elementstack[-1][2].append(text)
          736 
          737     def handle_entityref(self, ref):
          738         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
          739         if not self.elementstack:
          740             return
          741         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
          742             text = '&%s;' % ref
          743         elif ref in self.entities:
          744             text = self.entities[ref]
          745             if text.startswith('&#') and text.endswith(';'):
          746                 return self.handle_entityref(text)
          747         else:
          748             try:
          749                 name2codepoint[ref]
          750             except KeyError:
          751                 text = '&%s;' % ref
          752             else:
          753                 text = unichr(name2codepoint[ref]).encode('utf-8')
          754         self.elementstack[-1][2].append(text)
          755 
          756     def handle_data(self, text, escape=1):
          757         # called for each block of plain text, i.e. outside of any tag and
          758         # not containing any character or entity references
          759         if not self.elementstack:
          760             return
          761         if escape and self.contentparams.get('type') == u'application/xhtml+xml':
          762             text = _xmlescape(text)
          763         self.elementstack[-1][2].append(text)
          764 
          765     def handle_comment(self, text):
          766         # called for each comment, e.g. <!-- insert message here -->
          767         pass
          768 
          769     def handle_pi(self, text):
          770         # called for each processing instruction, e.g. <?instruction>
          771         pass
          772 
          773     def handle_decl(self, text):
          774         pass
          775 
          776     def parse_declaration(self, i):
          777         # override internal declaration handler to handle CDATA blocks
          778         if self.rawdata[i:i+9] == '<![CDATA[':
          779             k = self.rawdata.find(']]>', i)
          780             if k == -1:
          781                 # CDATA block began but didn't finish
          782                 k = len(self.rawdata)
          783                 return k
          784             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
          785             return k+3
          786         else:
          787             k = self.rawdata.find('>', i)
          788             if k >= 0:
          789                 return k+1
          790             else:
          791                 # We have an incomplete CDATA block.
          792                 return k
          793 
          794     def mapContentType(self, contentType):
          795         contentType = contentType.lower()
          796         if contentType == 'text' or contentType == 'plain':
          797             contentType = u'text/plain'
          798         elif contentType == 'html':
          799             contentType = u'text/html'
          800         elif contentType == 'xhtml':
          801             contentType = u'application/xhtml+xml'
          802         return contentType
          803 
          804     def trackNamespace(self, prefix, uri):
          805         loweruri = uri.lower()
          806         if not self.version:
          807             if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
          808                 self.version = u'rss090'
          809             elif loweruri == 'http://purl.org/rss/1.0/':
          810                 self.version = u'rss10'
          811             elif loweruri == 'http://www.w3.org/2005/atom':
          812                 self.version = u'atom10'
          813         if loweruri.find(u'backend.userland.com/rss') <> -1:
          814             # match any backend.userland.com namespace
          815             uri = u'http://backend.userland.com/rss'
          816             loweruri = uri
          817         if loweruri in self._matchnamespaces:
          818             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
          819             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
          820         else:
          821             self.namespacesInUse[prefix or ''] = uri
          822 
          823     def resolveURI(self, uri):
          824         return _urljoin(self.baseuri or u'', uri)
          825 
          826     def decodeEntities(self, element, data):
          827         return data
          828 
          829     def strattrs(self, attrs):
          830         return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
          831 
          832     def push(self, element, expectingText):
          833         self.elementstack.append([element, expectingText, []])
          834 
          835     def pop(self, element, stripWhitespace=1):
          836         if not self.elementstack:
          837             return
          838         if self.elementstack[-1][0] != element:
          839             return
          840 
          841         element, expectingText, pieces = self.elementstack.pop()
          842 
          843         if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
          844             # remove enclosing child element, but only if it is a <div> and
          845             # only if all the remaining content is nested underneath it.
          846             # This means that the divs would be retained in the following:
          847             #    <div>foo</div><div>bar</div>
          848             while pieces and len(pieces)>1 and not pieces[-1].strip():
          849                 del pieces[-1]
          850             while pieces and len(pieces)>1 and not pieces[0].strip():
          851                 del pieces[0]
          852             if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
          853                 depth = 0
          854                 for piece in pieces[:-1]:
          855                     if piece.startswith('</'):
          856                         depth -= 1
          857                         if depth == 0:
          858                             break
          859                     elif piece.startswith('<') and not piece.endswith('/>'):
          860                         depth += 1
          861                 else:
          862                     pieces = pieces[1:-1]
          863 
          864         # Ensure each piece is a str for Python 3
          865         for (i, v) in enumerate(pieces):
          866             if not isinstance(v, unicode):
          867                 pieces[i] = v.decode('utf-8')
          868 
          869         output = u''.join(pieces)
          870         if stripWhitespace:
          871             output = output.strip()
          872         if not expectingText:
          873             return output
          874 
          875         # decode base64 content
          876         if base64 and self.contentparams.get('base64', 0):
          877             try:
          878                 output = _base64decode(output)
          879             except binascii.Error:
          880                 pass
          881             except binascii.Incomplete:
          882                 pass
          883             except TypeError:
          884                 # In Python 3, base64 takes and outputs bytes, not str
          885                 # This may not be the most correct way to accomplish this
          886                 output = _base64decode(output.encode('utf-8')).decode('utf-8')
          887 
          888         # resolve relative URIs
          889         if (element in self.can_be_relative_uri) and output:
          890             # do not resolve guid elements with isPermalink="false"
          891             if not element == 'id' or self.guidislink:
          892                 output = self.resolveURI(output)
          893 
          894         # decode entities within embedded markup
          895         if not self.contentparams.get('base64', 0):
          896             output = self.decodeEntities(element, output)
          897 
          898         # some feed formats require consumers to guess
          899         # whether the content is html or plain text
          900         if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
          901             if self.lookslikehtml(output):
          902                 self.contentparams['type'] = u'text/html'
          903 
          904         # remove temporary cruft from contentparams
          905         try:
          906             del self.contentparams['mode']
          907         except KeyError:
          908             pass
          909         try:
          910             del self.contentparams['base64']
          911         except KeyError:
          912             pass
          913 
          914         is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
          915         # resolve relative URIs within embedded markup
          916         if is_htmlish and RESOLVE_RELATIVE_URIS:
          917             if element in self.can_contain_relative_uris:
          918                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))
          919 
          920         # sanitize embedded markup
          921         if is_htmlish and SANITIZE_HTML:
          922             if element in self.can_contain_dangerous_markup:
          923                 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))
          924 
          925         if self.encoding and not isinstance(output, unicode):
          926             output = output.decode(self.encoding, 'ignore')
          927 
          928         # address common error where people take data that is already
          929         # utf-8, presume that it is iso-8859-1, and re-encode it.
          930         if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
          931             try:
          932                 output = output.encode('iso-8859-1').decode('utf-8')
          933             except (UnicodeEncodeError, UnicodeDecodeError):
          934                 pass
          935 
          936         # map win-1252 extensions to the proper code points
          937         if isinstance(output, unicode):
          938             output = output.translate(_cp1252)
          939 
          940         # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords
          941         if element in ('category', 'tags', 'itunes_keywords'):
          942             return output
          943 
          944         if element == 'title' and -1 < self.title_depth <= self.depth:
          945             return output
          946 
          947         # store output in appropriate place(s)
          948         if self.inentry and not self.insource:
          949             if element == 'content':
          950                 self.entries[-1].setdefault(element, [])
          951                 contentparams = copy.deepcopy(self.contentparams)
          952                 contentparams['value'] = output
          953                 self.entries[-1][element].append(contentparams)
          954             elif element == 'link':
          955                 if not self.inimage:
          956                     # query variables in urls in link elements are improperly
          957                     # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
          958                     # unhandled character references. fix this special case.
          959                     output = output.replace('&amp;', '&')
          960                     output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
          961                     self.entries[-1][element] = output
          962                     if output:
          963                         self.entries[-1]['links'][-1]['href'] = output
          964             else:
          965                 if element == 'description':
          966                     element = 'summary'
          967                 old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
          968                 if old_value_depth is None or self.depth <= old_value_depth:
          969                     self.property_depth_map[self.entries[-1]][element] = self.depth
          970                     self.entries[-1][element] = output
          971                 if self.incontent:
          972                     contentparams = copy.deepcopy(self.contentparams)
          973                     contentparams['value'] = output
          974                     self.entries[-1][element + '_detail'] = contentparams
          975         elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
          976             context = self._getContext()
          977             if element == 'description':
          978                 element = 'subtitle'
          979             context[element] = output
          980             if element == 'link':
          981                 # fix query variables; see above for the explanation
          982                 output = re.sub("&([A-Za-z0-9_]+);", "&\g<1>", output)
          983                 context[element] = output
          984                 context['links'][-1]['href'] = output
          985             elif self.incontent:
          986                 contentparams = copy.deepcopy(self.contentparams)
          987                 contentparams['value'] = output
          988                 context[element + '_detail'] = contentparams
          989         return output
          990 
          991     def pushContent(self, tag, attrsD, defaultContentType, expectingText):
          992         self.incontent += 1
          993         if self.lang:
          994             self.lang=self.lang.replace('_','-')
          995         self.contentparams = FeedParserDict({
          996             'type': self.mapContentType(attrsD.get('type', defaultContentType)),
          997             'language': self.lang,
          998             'base': self.baseuri})
          999         self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
         1000         self.push(tag, expectingText)
         1001 
         1002     def popContent(self, tag):
         1003         value = self.pop(tag)
         1004         self.incontent -= 1
         1005         self.contentparams.clear()
         1006         return value
         1007 
         1008     # a number of elements in a number of RSS variants are nominally plain
         1009     # text, but this is routinely ignored.  This is an attempt to detect
         1010     # the most common cases.  As false positives often result in silent
         1011     # data loss, this function errs on the conservative side.
         1012     @staticmethod
         1013     def lookslikehtml(s):
         1014         # must have a close tag or an entity reference to qualify
         1015         if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
         1016             return
         1017 
         1018         # all tags must be in a restricted subset of valid HTML tags
         1019         if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
         1020             re.findall(r'</?(\w+)',s)):
         1021             return
         1022 
         1023         # all entities must have been defined as valid HTML entities
         1024         if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
         1025             return
         1026 
         1027         return 1
         1028 
         1029     def _mapToStandardPrefix(self, name):
         1030         colonpos = name.find(':')
         1031         if colonpos <> -1:
         1032             prefix = name[:colonpos]
         1033             suffix = name[colonpos+1:]
         1034             prefix = self.namespacemap.get(prefix, prefix)
         1035             name = prefix + ':' + suffix
         1036         return name
         1037 
         1038     def _getAttribute(self, attrsD, name):
         1039         return attrsD.get(self._mapToStandardPrefix(name))
         1040 
         1041     def _isBase64(self, attrsD, contentparams):
         1042         if attrsD.get('mode', '') == 'base64':
         1043             return 1
         1044         if self.contentparams['type'].startswith(u'text/'):
         1045             return 0
         1046         if self.contentparams['type'].endswith(u'+xml'):
         1047             return 0
         1048         if self.contentparams['type'].endswith(u'/xml'):
         1049             return 0
         1050         return 1
         1051 
         1052     def _itsAnHrefDamnIt(self, attrsD):
         1053         href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
         1054         if href:
         1055             try:
         1056                 del attrsD['url']
         1057             except KeyError:
         1058                 pass
         1059             try:
         1060                 del attrsD['uri']
         1061             except KeyError:
         1062                 pass
         1063             attrsD['href'] = href
         1064         return attrsD
         1065 
         1066     def _save(self, key, value, overwrite=False):
         1067         context = self._getContext()
         1068         if overwrite:
         1069             context[key] = value
         1070         else:
         1071             context.setdefault(key, value)
         1072 
         1073     def _start_rss(self, attrsD):
         1074         versionmap = {'0.91': u'rss091u',
         1075                       '0.92': u'rss092',
         1076                       '0.93': u'rss093',
         1077                       '0.94': u'rss094'}
         1078         #If we're here then this is an RSS feed.
         1079         #If we don't have a version or have a version that starts with something
         1080         #other than RSS then there's been a mistake. Correct it.
         1081         if not self.version or not self.version.startswith(u'rss'):
         1082             attr_version = attrsD.get('version', '')
         1083             version = versionmap.get(attr_version)
         1084             if version:
         1085                 self.version = version
         1086             elif attr_version.startswith('2.'):
         1087                 self.version = u'rss20'
         1088             else:
         1089                 self.version = u'rss'
         1090 
         1091     def _start_channel(self, attrsD):
         1092         self.infeed = 1
         1093         self._cdf_common(attrsD)
         1094 
         1095     def _cdf_common(self, attrsD):
         1096         if 'lastmod' in attrsD:
         1097             self._start_modified({})
         1098             self.elementstack[-1][-1] = attrsD['lastmod']
         1099             self._end_modified()
         1100         if 'href' in attrsD:
         1101             self._start_link({})
         1102             self.elementstack[-1][-1] = attrsD['href']
         1103             self._end_link()
         1104 
         1105     def _start_feed(self, attrsD):
         1106         self.infeed = 1
         1107         versionmap = {'0.1': u'atom01',
         1108                       '0.2': u'atom02',
         1109                       '0.3': u'atom03'}
         1110         if not self.version:
         1111             attr_version = attrsD.get('version')
         1112             version = versionmap.get(attr_version)
         1113             if version:
         1114                 self.version = version
         1115             else:
         1116                 self.version = u'atom'
         1117 
         1118     def _end_channel(self):
         1119         self.infeed = 0
         1120     _end_feed = _end_channel
         1121 
         1122     def _start_image(self, attrsD):
         1123         context = self._getContext()
         1124         if not self.inentry:
         1125             context.setdefault('image', FeedParserDict())
         1126         self.inimage = 1
         1127         self.title_depth = -1
         1128         self.push('image', 0)
         1129 
         1130     def _end_image(self):
         1131         self.pop('image')
         1132         self.inimage = 0
         1133 
         1134     def _start_textinput(self, attrsD):
         1135         context = self._getContext()
         1136         context.setdefault('textinput', FeedParserDict())
         1137         self.intextinput = 1
         1138         self.title_depth = -1
         1139         self.push('textinput', 0)
         1140     _start_textInput = _start_textinput
         1141 
         1142     def _end_textinput(self):
         1143         self.pop('textinput')
         1144         self.intextinput = 0
         1145     _end_textInput = _end_textinput
         1146 
         1147     def _start_author(self, attrsD):
         1148         self.inauthor = 1
         1149         self.push('author', 1)
         1150         # Append a new FeedParserDict when expecting an author
         1151         context = self._getContext()
         1152         context.setdefault('authors', [])
         1153         context['authors'].append(FeedParserDict())
         1154     _start_managingeditor = _start_author
         1155     _start_dc_author = _start_author
         1156     _start_dc_creator = _start_author
         1157     _start_itunes_author = _start_author
         1158 
         1159     def _end_author(self):
         1160         self.pop('author')
         1161         self.inauthor = 0
         1162         self._sync_author_detail()
         1163     _end_managingeditor = _end_author
         1164     _end_dc_author = _end_author
         1165     _end_dc_creator = _end_author
         1166     _end_itunes_author = _end_author
         1167 
         1168     def _start_itunes_owner(self, attrsD):
         1169         self.inpublisher = 1
         1170         self.push('publisher', 0)
         1171 
         1172     def _end_itunes_owner(self):
         1173         self.pop('publisher')
         1174         self.inpublisher = 0
         1175         self._sync_author_detail('publisher')
         1176 
         1177     def _start_contributor(self, attrsD):
         1178         self.incontributor = 1
         1179         context = self._getContext()
         1180         context.setdefault('contributors', [])
         1181         context['contributors'].append(FeedParserDict())
         1182         self.push('contributor', 0)
         1183 
         1184     def _end_contributor(self):
         1185         self.pop('contributor')
         1186         self.incontributor = 0
         1187 
         1188     def _start_dc_contributor(self, attrsD):
         1189         self.incontributor = 1
         1190         context = self._getContext()
         1191         context.setdefault('contributors', [])
         1192         context['contributors'].append(FeedParserDict())
         1193         self.push('name', 0)
         1194 
         1195     def _end_dc_contributor(self):
         1196         self._end_name()
         1197         self.incontributor = 0
         1198 
         1199     def _start_name(self, attrsD):
         1200         self.push('name', 0)
         1201     _start_itunes_name = _start_name
         1202 
         1203     def _end_name(self):
         1204         value = self.pop('name')
         1205         if self.inpublisher:
         1206             self._save_author('name', value, 'publisher')
         1207         elif self.inauthor:
         1208             self._save_author('name', value)
         1209         elif self.incontributor:
         1210             self._save_contributor('name', value)
         1211         elif self.intextinput:
         1212             context = self._getContext()
         1213             context['name'] = value
         1214     _end_itunes_name = _end_name
         1215 
         1216     def _start_width(self, attrsD):
         1217         self.push('width', 0)
         1218 
         1219     def _end_width(self):
         1220         value = self.pop('width')
         1221         try:
         1222             value = int(value)
         1223         except ValueError:
         1224             value = 0
         1225         if self.inimage:
         1226             context = self._getContext()
         1227             context['width'] = value
         1228 
         1229     def _start_height(self, attrsD):
         1230         self.push('height', 0)
         1231 
         1232     def _end_height(self):
         1233         value = self.pop('height')
         1234         try:
         1235             value = int(value)
         1236         except ValueError:
         1237             value = 0
         1238         if self.inimage:
         1239             context = self._getContext()
         1240             context['height'] = value
         1241 
         1242     def _start_url(self, attrsD):
         1243         self.push('href', 1)
         1244     _start_homepage = _start_url
         1245     _start_uri = _start_url
         1246 
         1247     def _end_url(self):
         1248         value = self.pop('href')
         1249         if self.inauthor:
         1250             self._save_author('href', value)
         1251         elif self.incontributor:
         1252             self._save_contributor('href', value)
         1253     _end_homepage = _end_url
         1254     _end_uri = _end_url
         1255 
         1256     def _start_email(self, attrsD):
         1257         self.push('email', 0)
         1258     _start_itunes_email = _start_email
         1259 
         1260     def _end_email(self):
         1261         value = self.pop('email')
         1262         if self.inpublisher:
         1263             self._save_author('email', value, 'publisher')
         1264         elif self.inauthor:
         1265             self._save_author('email', value)
         1266         elif self.incontributor:
         1267             self._save_contributor('email', value)
         1268     _end_itunes_email = _end_email
         1269 
         1270     def _getContext(self):
         1271         if self.insource:
         1272             context = self.sourcedata
         1273         elif self.inimage and 'image' in self.feeddata:
         1274             context = self.feeddata['image']
         1275         elif self.intextinput:
         1276             context = self.feeddata['textinput']
         1277         elif self.inentry:
         1278             context = self.entries[-1]
         1279         else:
         1280             context = self.feeddata
         1281         return context
         1282 
         1283     def _save_author(self, key, value, prefix='author'):
         1284         context = self._getContext()
         1285         context.setdefault(prefix + '_detail', FeedParserDict())
         1286         context[prefix + '_detail'][key] = value
         1287         self._sync_author_detail()
         1288         context.setdefault('authors', [FeedParserDict()])
         1289         context['authors'][-1][key] = value
         1290 
         1291     def _save_contributor(self, key, value):
         1292         context = self._getContext()
         1293         context.setdefault('contributors', [FeedParserDict()])
         1294         context['contributors'][-1][key] = value
         1295 
         1296     def _sync_author_detail(self, key='author'):
         1297         context = self._getContext()
         1298         detail = context.get('%ss' % key, [FeedParserDict()])[-1]
         1299         if detail:
         1300             name = detail.get('name')
         1301             email = detail.get('email')
         1302             if name and email:
         1303                 context[key] = u'%s (%s)' % (name, email)
         1304             elif name:
         1305                 context[key] = name
         1306             elif email:
         1307                 context[key] = email
         1308         else:
         1309             author, email = context.get(key), None
         1310             if not author:
         1311                 return
         1312             emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
         1313             if emailmatch:
         1314                 email = emailmatch.group(0)
         1315                 # probably a better way to do the following, but it passes all the tests
         1316                 author = author.replace(email, u'')
         1317                 author = author.replace(u'()', u'')
         1318                 author = author.replace(u'<>', u'')
         1319                 author = author.replace(u'&lt;&gt;', u'')
         1320                 author = author.strip()
         1321                 if author and (author[0] == u'('):
         1322                     author = author[1:]
         1323                 if author and (author[-1] == u')'):
         1324                     author = author[:-1]
         1325                 author = author.strip()
         1326             if author or email:
         1327                 context.setdefault('%s_detail' % key, detail)
         1328             if author:
         1329                 detail['name'] = author
         1330             if email:
         1331                 detail['email'] = email
         1332 
         1333     def _start_subtitle(self, attrsD):
         1334         self.pushContent('subtitle', attrsD, u'text/plain', 1)
         1335     _start_tagline = _start_subtitle
         1336     _start_itunes_subtitle = _start_subtitle
         1337 
         1338     def _end_subtitle(self):
         1339         self.popContent('subtitle')
         1340     _end_tagline = _end_subtitle
         1341     _end_itunes_subtitle = _end_subtitle
         1342 
         1343     def _start_rights(self, attrsD):
         1344         self.pushContent('rights', attrsD, u'text/plain', 1)
         1345     _start_dc_rights = _start_rights
         1346     _start_copyright = _start_rights
         1347 
         1348     def _end_rights(self):
         1349         self.popContent('rights')
         1350     _end_dc_rights = _end_rights
         1351     _end_copyright = _end_rights
         1352 
         1353     def _start_item(self, attrsD):
         1354         self.entries.append(FeedParserDict())
         1355         self.push('item', 0)
         1356         self.inentry = 1
         1357         self.guidislink = 0
         1358         self.title_depth = -1
         1359         self.psc_chapters_flag = None
         1360         id = self._getAttribute(attrsD, 'rdf:about')
         1361         if id:
         1362             context = self._getContext()
         1363             context['id'] = id
         1364         self._cdf_common(attrsD)
         1365     _start_entry = _start_item
         1366 
         1367     def _end_item(self):
         1368         self.pop('item')
         1369         self.inentry = 0
         1370     _end_entry = _end_item
         1371 
         1372     def _start_dc_language(self, attrsD):
         1373         self.push('language', 1)
         1374     _start_language = _start_dc_language
         1375 
         1376     def _end_dc_language(self):
         1377         self.lang = self.pop('language')
         1378     _end_language = _end_dc_language
         1379 
         1380     def _start_dc_publisher(self, attrsD):
         1381         self.push('publisher', 1)
         1382     _start_webmaster = _start_dc_publisher
         1383 
         1384     def _end_dc_publisher(self):
         1385         self.pop('publisher')
         1386         self._sync_author_detail('publisher')
         1387     _end_webmaster = _end_dc_publisher
         1388 
         1389     def _start_dcterms_valid(self, attrsD):
         1390         self.push('validity', 1)
         1391 
         1392     def _end_dcterms_valid(self):
         1393         for validity_detail in self.pop('validity').split(';'):
         1394             if '=' in validity_detail:
         1395                 key, value = validity_detail.split('=', 1)
         1396                 if key == 'start':
         1397                     self._save('validity_start', value, overwrite=True)
         1398                     self._save('validity_start_parsed', _parse_date(value), overwrite=True)
         1399                 elif key == 'end':
         1400                     self._save('validity_end', value, overwrite=True)
         1401                     self._save('validity_end_parsed', _parse_date(value), overwrite=True)
         1402 
         1403     def _start_published(self, attrsD):
         1404         self.push('published', 1)
         1405     _start_dcterms_issued = _start_published
         1406     _start_issued = _start_published
         1407     _start_pubdate = _start_published
         1408 
         1409     def _end_published(self):
         1410         value = self.pop('published')
         1411         self._save('published_parsed', _parse_date(value), overwrite=True)
         1412     _end_dcterms_issued = _end_published
         1413     _end_issued = _end_published
         1414     _end_pubdate = _end_published
         1415 
         1416     def _start_updated(self, attrsD):
         1417         self.push('updated', 1)
         1418     _start_modified = _start_updated
         1419     _start_dcterms_modified = _start_updated
         1420     _start_dc_date = _start_updated
         1421     _start_lastbuilddate = _start_updated
         1422 
         1423     def _end_updated(self):
         1424         value = self.pop('updated')
         1425         parsed_value = _parse_date(value)
         1426         self._save('updated_parsed', parsed_value, overwrite=True)
         1427     _end_modified = _end_updated
         1428     _end_dcterms_modified = _end_updated
         1429     _end_dc_date = _end_updated
         1430     _end_lastbuilddate = _end_updated
         1431 
         1432     def _start_created(self, attrsD):
         1433         self.push('created', 1)
         1434     _start_dcterms_created = _start_created
         1435 
         1436     def _end_created(self):
         1437         value = self.pop('created')
         1438         self._save('created_parsed', _parse_date(value), overwrite=True)
         1439     _end_dcterms_created = _end_created
         1440 
         1441     def _start_expirationdate(self, attrsD):
         1442         self.push('expired', 1)
         1443 
         1444     def _end_expirationdate(self):
         1445         self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
         1446 
         1447     # geospatial location, or "where", from georss.org
         1448 
         1449     def _start_georssgeom(self, attrsD):
         1450         self.push('geometry', 0)
         1451         context = self._getContext()
         1452         context['where'] = FeedParserDict()
         1453 
         1454     _start_georss_point = _start_georssgeom
         1455     _start_georss_line = _start_georssgeom
         1456     _start_georss_polygon = _start_georssgeom
         1457     _start_georss_box = _start_georssgeom
         1458 
         1459     def _save_where(self, geometry):
         1460         context = self._getContext()
         1461         context['where'].update(geometry)
         1462 
         1463     def _end_georss_point(self):
         1464         geometry = _parse_georss_point(self.pop('geometry'))
         1465         if geometry:
         1466             self._save_where(geometry)
         1467 
         1468     def _end_georss_line(self):
         1469         geometry = _parse_georss_line(self.pop('geometry'))
         1470         if geometry:
         1471             self._save_where(geometry)
         1472 
         1473     def _end_georss_polygon(self):
         1474         this = self.pop('geometry')
         1475         geometry = _parse_georss_polygon(this)
         1476         if geometry:
         1477             self._save_where(geometry)
         1478 
         1479     def _end_georss_box(self):
         1480         geometry = _parse_georss_box(self.pop('geometry'))
         1481         if geometry:
         1482             self._save_where(geometry)
         1483 
         1484     def _start_where(self, attrsD):
         1485         self.push('where', 0)
         1486         context = self._getContext()
         1487         context['where'] = FeedParserDict()
         1488     _start_georss_where = _start_where
         1489 
         1490     def _parse_srs_attrs(self, attrsD):
         1491         srsName = attrsD.get('srsname')
         1492         try:
         1493             srsDimension = int(attrsD.get('srsdimension', '2'))
         1494         except ValueError:
         1495             srsDimension = 2
         1496         context = self._getContext()
         1497         context['where']['srsName'] = srsName
         1498         context['where']['srsDimension'] = srsDimension
         1499 
         1500     def _start_gml_point(self, attrsD):
         1501         self._parse_srs_attrs(attrsD)
         1502         self.ingeometry = 1
         1503         self.push('geometry', 0)
         1504 
         1505     def _start_gml_linestring(self, attrsD):
         1506         self._parse_srs_attrs(attrsD)
         1507         self.ingeometry = 'linestring'
         1508         self.push('geometry', 0)
         1509 
         1510     def _start_gml_polygon(self, attrsD):
         1511         self._parse_srs_attrs(attrsD)
         1512         self.push('geometry', 0)
         1513 
         1514     def _start_gml_exterior(self, attrsD):
         1515         self.push('geometry', 0)
         1516 
         1517     def _start_gml_linearring(self, attrsD):
         1518         self.ingeometry = 'polygon'
         1519         self.push('geometry', 0)
         1520 
         1521     def _start_gml_pos(self, attrsD):
         1522         self.push('pos', 0)
         1523 
         1524     def _end_gml_pos(self):
         1525         this = self.pop('pos')
         1526         context = self._getContext()
         1527         srsName = context['where'].get('srsName')
         1528         srsDimension = context['where'].get('srsDimension', 2)
         1529         swap = True
         1530         if srsName and "EPSG" in srsName:
         1531             epsg = int(srsName.split(":")[-1])
         1532             swap = bool(epsg in _geogCS)
         1533         geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
         1534         if geometry:
         1535             self._save_where(geometry)
         1536 
         1537     def _start_gml_poslist(self, attrsD):
         1538         self.push('pos', 0)
         1539 
         1540     def _end_gml_poslist(self):
         1541         this = self.pop('pos')
         1542         context = self._getContext()
         1543         srsName = context['where'].get('srsName')
         1544         srsDimension = context['where'].get('srsDimension', 2)
         1545         swap = True
         1546         if srsName and "EPSG" in srsName:
         1547             epsg = int(srsName.split(":")[-1])
         1548             swap = bool(epsg in _geogCS)
         1549         geometry = _parse_poslist(
         1550             this, self.ingeometry, swap=swap, dims=srsDimension)
         1551         if geometry:
         1552             self._save_where(geometry)
         1553 
         1554     def _end_geom(self):
         1555         self.ingeometry = 0
         1556         self.pop('geometry')
         1557     _end_gml_point = _end_geom
         1558     _end_gml_linestring = _end_geom
         1559     _end_gml_linearring = _end_geom
         1560     _end_gml_exterior = _end_geom
         1561     _end_gml_polygon = _end_geom
         1562 
         1563     def _end_where(self):
         1564         self.pop('where')
         1565     _end_georss_where = _end_where
         1566 
         1567     # end geospatial
         1568 
         1569     def _start_cc_license(self, attrsD):
         1570         context = self._getContext()
         1571         value = self._getAttribute(attrsD, 'rdf:resource')
         1572         attrsD = FeedParserDict()
         1573         attrsD['rel'] = u'license'
         1574         if value:
         1575             attrsD['href']=value
         1576         context.setdefault('links', []).append(attrsD)
         1577 
         1578     def _start_creativecommons_license(self, attrsD):
         1579         self.push('license', 1)
         1580     _start_creativeCommons_license = _start_creativecommons_license
         1581 
         1582     def _end_creativecommons_license(self):
         1583         value = self.pop('license')
         1584         context = self._getContext()
         1585         attrsD = FeedParserDict()
         1586         attrsD['rel'] = u'license'
         1587         if value:
         1588             attrsD['href'] = value
         1589         context.setdefault('links', []).append(attrsD)
         1590         del context['license']
         1591     _end_creativeCommons_license = _end_creativecommons_license
         1592 
         1593     def _addTag(self, term, scheme, label):
         1594         context = self._getContext()
         1595         tags = context.setdefault('tags', [])
         1596         if (not term) and (not scheme) and (not label):
         1597             return
         1598         value = FeedParserDict(term=term, scheme=scheme, label=label)
         1599         if value not in tags:
         1600             tags.append(value)
         1601 
         1602     def _start_tags(self, attrsD):
         1603         # This is a completely-made up element. Its semantics are determined
         1604         # only by a single feed that precipitated bug report 392 on Google Code.
         1605         # In short, this is junk code.
         1606         self.push('tags', 1)
         1607 
         1608     def _end_tags(self):
         1609         for term in self.pop('tags').split(','):
         1610             self._addTag(term.strip(), None, None)
         1611 
         1612     def _start_category(self, attrsD):
         1613         term = attrsD.get('term')
         1614         scheme = attrsD.get('scheme', attrsD.get('domain'))
         1615         label = attrsD.get('label')
         1616         self._addTag(term, scheme, label)
         1617         self.push('category', 1)
         1618     _start_dc_subject = _start_category
         1619     _start_keywords = _start_category
         1620 
         1621     def _start_media_category(self, attrsD):
         1622         attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
         1623         self._start_category(attrsD)
         1624 
         1625     def _end_itunes_keywords(self):
         1626         for term in self.pop('itunes_keywords').split(','):
         1627             if term.strip():
         1628                 self._addTag(term.strip(), u'http://www.itunes.com/', None)
         1629 
         1630     def _end_media_keywords(self):
         1631         for term in self.pop('media_keywords').split(','):
         1632             if term.strip():
         1633                 self._addTag(term.strip(), None, None)
         1634 
         1635     def _start_itunes_category(self, attrsD):
         1636         self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
         1637         self.push('category', 1)
         1638 
         1639     def _end_category(self):
         1640         value = self.pop('category')
         1641         if not value:
         1642             return
         1643         context = self._getContext()
         1644         tags = context['tags']
         1645         if value and len(tags) and not tags[-1]['term']:
         1646             tags[-1]['term'] = value
         1647         else:
         1648             self._addTag(value, None, None)
         1649     _end_dc_subject = _end_category
         1650     _end_keywords = _end_category
         1651     _end_itunes_category = _end_category
         1652     _end_media_category = _end_category
         1653 
         1654     def _start_cloud(self, attrsD):
         1655         self._getContext()['cloud'] = FeedParserDict(attrsD)
         1656 
         1657     def _start_link(self, attrsD):
         1658         attrsD.setdefault('rel', u'alternate')
         1659         if attrsD['rel'] == u'self':
         1660             attrsD.setdefault('type', u'application/atom+xml')
         1661         else:
         1662             attrsD.setdefault('type', u'text/html')
         1663         context = self._getContext()
         1664         attrsD = self._itsAnHrefDamnIt(attrsD)
         1665         if 'href' in attrsD:
         1666             attrsD['href'] = self.resolveURI(attrsD['href'])
         1667         expectingText = self.infeed or self.inentry or self.insource
         1668         context.setdefault('links', [])
         1669         if not (self.inentry and self.inimage):
         1670             context['links'].append(FeedParserDict(attrsD))
         1671         if 'href' in attrsD:
         1672             expectingText = 0
         1673             if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
         1674                 context['link'] = attrsD['href']
         1675         else:
         1676             self.push('link', expectingText)
         1677 
         1678     def _end_link(self):
         1679         value = self.pop('link')
         1680 
         1681     def _start_guid(self, attrsD):
         1682         self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
         1683         self.push('id', 1)
         1684     _start_id = _start_guid
         1685 
         1686     def _end_guid(self):
         1687         value = self.pop('id')
         1688         self._save('guidislink', self.guidislink and 'link' not in self._getContext())
         1689         if self.guidislink:
         1690             # guid acts as link, but only if 'ispermalink' is not present or is 'true',
         1691             # and only if the item doesn't already have a link element
         1692             self._save('link', value)
         1693     _end_id = _end_guid
         1694 
         1695     def _start_title(self, attrsD):
         1696         if self.svgOK:
         1697             return self.unknown_starttag('title', attrsD.items())
         1698         self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
         1699     _start_dc_title = _start_title
         1700     _start_media_title = _start_title
         1701 
         1702     def _end_title(self):
         1703         if self.svgOK:
         1704             return
         1705         value = self.popContent('title')
         1706         if not value:
         1707             return
         1708         self.title_depth = self.depth
         1709     _end_dc_title = _end_title
         1710 
         1711     def _end_media_title(self):
         1712         title_depth = self.title_depth
         1713         self._end_title()
         1714         self.title_depth = title_depth
         1715 
         1716     def _start_description(self, attrsD):
         1717         context = self._getContext()
         1718         if 'summary' in context:
         1719             self._summaryKey = 'content'
         1720             self._start_content(attrsD)
         1721         else:
         1722             self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
         1723     _start_dc_description = _start_description
         1724     _start_media_description = _start_description
         1725 
         1726     def _start_abstract(self, attrsD):
         1727         self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
         1728 
         1729     def _end_description(self):
         1730         if self._summaryKey == 'content':
         1731             self._end_content()
         1732         else:
         1733             value = self.popContent('description')
         1734         self._summaryKey = None
         1735     _end_abstract = _end_description
         1736     _end_dc_description = _end_description
         1737     _end_media_description = _end_description
         1738 
         1739     def _start_info(self, attrsD):
         1740         self.pushContent('info', attrsD, u'text/plain', 1)
         1741     _start_feedburner_browserfriendly = _start_info
         1742 
         1743     def _end_info(self):
         1744         self.popContent('info')
         1745     _end_feedburner_browserfriendly = _end_info
         1746 
         1747     def _start_generator(self, attrsD):
         1748         if attrsD:
         1749             attrsD = self._itsAnHrefDamnIt(attrsD)
         1750             if 'href' in attrsD:
         1751                 attrsD['href'] = self.resolveURI(attrsD['href'])
         1752         self._getContext()['generator_detail'] = FeedParserDict(attrsD)
         1753         self.push('generator', 1)
         1754 
         1755     def _end_generator(self):
         1756         value = self.pop('generator')
         1757         context = self._getContext()
         1758         if 'generator_detail' in context:
         1759             context['generator_detail']['name'] = value
         1760 
         1761     def _start_admin_generatoragent(self, attrsD):
         1762         self.push('generator', 1)
         1763         value = self._getAttribute(attrsD, 'rdf:resource')
         1764         if value:
         1765             self.elementstack[-1][2].append(value)
         1766         self.pop('generator')
         1767         self._getContext()['generator_detail'] = FeedParserDict({'href': value})
         1768 
         1769     def _start_admin_errorreportsto(self, attrsD):
         1770         self.push('errorreportsto', 1)
         1771         value = self._getAttribute(attrsD, 'rdf:resource')
         1772         if value:
         1773             self.elementstack[-1][2].append(value)
         1774         self.pop('errorreportsto')
         1775 
         1776     def _start_summary(self, attrsD):
         1777         context = self._getContext()
         1778         if 'summary' in context:
         1779             self._summaryKey = 'content'
         1780             self._start_content(attrsD)
         1781         else:
         1782             self._summaryKey = 'summary'
         1783             self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
         1784     _start_itunes_summary = _start_summary
         1785 
         1786     def _end_summary(self):
         1787         if self._summaryKey == 'content':
         1788             self._end_content()
         1789         else:
         1790             self.popContent(self._summaryKey or 'summary')
         1791         self._summaryKey = None
         1792     _end_itunes_summary = _end_summary
         1793 
         1794     def _start_enclosure(self, attrsD):
         1795         attrsD = self._itsAnHrefDamnIt(attrsD)
         1796         context = self._getContext()
         1797         attrsD['rel'] = u'enclosure'
         1798         context.setdefault('links', []).append(FeedParserDict(attrsD))
         1799 
         1800     def _start_source(self, attrsD):
         1801         if 'url' in attrsD:
         1802             # This means that we're processing a source element from an RSS 2.0 feed
         1803             self.sourcedata['href'] = attrsD[u'url']
         1804         self.push('source', 1)
         1805         self.insource = 1
         1806         self.title_depth = -1
         1807 
         1808     def _end_source(self):
         1809         self.insource = 0
         1810         value = self.pop('source')
         1811         if value:
         1812             self.sourcedata['title'] = value
         1813         self._getContext()['source'] = copy.deepcopy(self.sourcedata)
         1814         self.sourcedata.clear()
         1815 
         1816     def _start_content(self, attrsD):
         1817         self.pushContent('content', attrsD, u'text/plain', 1)
         1818         src = attrsD.get('src')
         1819         if src:
         1820             self.contentparams['src'] = src
         1821         self.push('content', 1)
         1822 
         1823     def _start_body(self, attrsD):
         1824         self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
         1825     _start_xhtml_body = _start_body
         1826 
         1827     def _start_content_encoded(self, attrsD):
         1828         self.pushContent('content', attrsD, u'text/html', 1)
         1829     _start_fullitem = _start_content_encoded
         1830 
         1831     def _end_content(self):
         1832         copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
         1833         value = self.popContent('content')
         1834         if copyToSummary:
         1835             self._save('summary', value)
         1836 
         1837     _end_body = _end_content
         1838     _end_xhtml_body = _end_content
         1839     _end_content_encoded = _end_content
         1840     _end_fullitem = _end_content
         1841 
         1842     def _start_itunes_image(self, attrsD):
         1843         self.push('itunes_image', 0)
         1844         if attrsD.get('href'):
         1845             self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
         1846         elif attrsD.get('url'):
         1847             self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
         1848     _start_itunes_link = _start_itunes_image
         1849 
         1850     def _end_itunes_block(self):
         1851         value = self.pop('itunes_block', 0)
         1852         self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
         1853 
         1854     def _end_itunes_explicit(self):
         1855         value = self.pop('itunes_explicit', 0)
         1856         # Convert 'yes' -> True, 'clean' to False, and any other value to None
         1857         # False and None both evaluate as False, so the difference can be ignored
         1858         # by applications that only need to know if the content is explicit.
         1859         self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
         1860 
         1861     def _start_media_group(self, attrsD):
         1862         # don't do anything, but don't break the enclosed tags either
         1863         pass
         1864 
         1865     def _start_media_rating(self, attrsD):
         1866         context = self._getContext()
         1867         context.setdefault('media_rating', attrsD)
         1868         self.push('rating', 1)
         1869 
         1870     def _end_media_rating(self):
         1871         rating = self.pop('rating')
         1872         if rating is not None and rating.strip():
         1873             context = self._getContext()
         1874             context['media_rating']['content'] = rating
         1875 
         1876     def _start_media_credit(self, attrsD):
         1877         context = self._getContext()
         1878         context.setdefault('media_credit', [])
         1879         context['media_credit'].append(attrsD)
         1880         self.push('credit', 1)
         1881 
         1882     def _end_media_credit(self):
         1883         credit = self.pop('credit')
         1884         if credit != None and len(credit.strip()) != 0:
         1885             context = self._getContext()
         1886             context['media_credit'][-1]['content'] = credit
         1887 
         1888     def _start_media_restriction(self, attrsD):
         1889         context = self._getContext()
         1890         context.setdefault('media_restriction', attrsD)
         1891         self.push('restriction', 1)
         1892 
         1893     def _end_media_restriction(self):
         1894         restriction = self.pop('restriction')
         1895         if restriction != None and len(restriction.strip()) != 0:
         1896             context = self._getContext()
         1897             context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')]
         1898 
         1899     def _start_media_license(self, attrsD):
         1900         context = self._getContext()
         1901         context.setdefault('media_license', attrsD)
         1902         self.push('license', 1)
         1903 
         1904     def _end_media_license(self):
         1905         license = self.pop('license')
         1906         if license != None and len(license.strip()) != 0:
         1907             context = self._getContext()
         1908             context['media_license']['content'] = license
         1909 
         1910     def _start_media_content(self, attrsD):
         1911         context = self._getContext()
         1912         context.setdefault('media_content', [])
         1913         context['media_content'].append(attrsD)
         1914 
         1915     def _start_media_thumbnail(self, attrsD):
         1916         context = self._getContext()
         1917         context.setdefault('media_thumbnail', [])
         1918         self.push('url', 1) # new
         1919         context['media_thumbnail'].append(attrsD)
         1920 
         1921     def _end_media_thumbnail(self):
         1922         url = self.pop('url')
         1923         context = self._getContext()
         1924         if url != None and len(url.strip()) != 0:
         1925             if 'url' not in context['media_thumbnail'][-1]:
         1926                 context['media_thumbnail'][-1]['url'] = url
         1927 
         1928     def _start_media_player(self, attrsD):
         1929         self.push('media_player', 0)
         1930         self._getContext()['media_player'] = FeedParserDict(attrsD)
         1931 
         1932     def _end_media_player(self):
         1933         value = self.pop('media_player')
         1934         context = self._getContext()
         1935         context['media_player']['content'] = value
         1936 
         1937     def _start_newlocation(self, attrsD):
         1938         self.push('newlocation', 1)
         1939 
         1940     def _end_newlocation(self):
         1941         url = self.pop('newlocation')
         1942         context = self._getContext()
         1943         # don't set newlocation if the context isn't right
         1944         if context is not self.feeddata:
         1945             return
         1946         context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
         1947 
         1948     def _start_psc_chapters(self, attrsD):
         1949         if self.psc_chapters_flag is None:
         1950             # Transition from None -> True
         1951             self.psc_chapters_flag = True
         1952             attrsD['chapters'] = []
         1953             self._getContext()['psc_chapters'] = FeedParserDict(attrsD)
         1954 
         1955     def _end_psc_chapters(self):
         1956         # Transition from True -> False
         1957         self.psc_chapters_flag = False
         1958 
         1959     def _start_psc_chapter(self, attrsD):
         1960         if self.psc_chapters_flag:
         1961             start = self._getAttribute(attrsD, 'start')
         1962             attrsD['start_parsed'] = _parse_psc_chapter_start(start)
         1963 
         1964             context = self._getContext()['psc_chapters']
         1965             context['chapters'].append(FeedParserDict(attrsD))
         1966 
         1967 
         1968 if _XML_AVAILABLE:
         1969     class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
         1970         def __init__(self, baseuri, baselang, encoding):
         1971             xml.sax.handler.ContentHandler.__init__(self)
         1972             _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
         1973             self.bozo = 0
         1974             self.exc = None
         1975             self.decls = {}
         1976 
         1977         def startPrefixMapping(self, prefix, uri):
         1978             if not uri:
         1979                 return
         1980             # Jython uses '' instead of None; standardize on None
         1981             prefix = prefix or None
         1982             self.trackNamespace(prefix, uri)
         1983             if prefix and uri == 'http://www.w3.org/1999/xlink':
         1984                 self.decls['xmlns:' + prefix] = uri
         1985 
         1986         def startElementNS(self, name, qname, attrs):
         1987             namespace, localname = name
         1988             lowernamespace = str(namespace or '').lower()
         1989             if lowernamespace.find(u'backend.userland.com/rss') <> -1:
         1990                 # match any backend.userland.com namespace
         1991                 namespace = u'http://backend.userland.com/rss'
         1992                 lowernamespace = namespace
         1993             if qname and qname.find(':') > 0:
         1994                 givenprefix = qname.split(':')[0]
         1995             else:
         1996                 givenprefix = None
         1997             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
         1998             if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
         1999                 raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
         2000             localname = str(localname).lower()
         2001 
         2002             # qname implementation is horribly broken in Python 2.1 (it
         2003             # doesn't report any), and slightly broken in Python 2.2 (it
         2004             # doesn't report the xml: namespace). So we match up namespaces
         2005             # with a known list first, and then possibly override them with
         2006             # the qnames the SAX parser gives us (if indeed it gives us any
         2007             # at all).  Thanks to MatejC for helping me test this and
         2008             # tirelessly telling me that it didn't work yet.
         2009             attrsD, self.decls = self.decls, {}
         2010             if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
         2011                 attrsD['xmlns']=namespace
         2012             if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
         2013                 attrsD['xmlns']=namespace
         2014 
         2015             if prefix:
         2016                 localname = prefix.lower() + ':' + localname
         2017             elif namespace and not qname: #Expat
         2018                 for name,value in self.namespacesInUse.items():
         2019                     if name and value == namespace:
         2020                         localname = name + ':' + localname
         2021                         break
         2022 
         2023             for (namespace, attrlocalname), attrvalue in attrs.items():
         2024                 lowernamespace = (namespace or '').lower()
         2025                 prefix = self._matchnamespaces.get(lowernamespace, '')
         2026                 if prefix:
         2027                     attrlocalname = prefix + ':' + attrlocalname
         2028                 attrsD[str(attrlocalname).lower()] = attrvalue
         2029             for qname in attrs.getQNames():
         2030                 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
         2031             localname = str(localname).lower()
         2032             self.unknown_starttag(localname, attrsD.items())
         2033 
         2034         def characters(self, text):
         2035             self.handle_data(text)
         2036 
         2037         def endElementNS(self, name, qname):
         2038             namespace, localname = name
         2039             lowernamespace = str(namespace or '').lower()
         2040             if qname and qname.find(':') > 0:
         2041                 givenprefix = qname.split(':')[0]
         2042             else:
         2043                 givenprefix = ''
         2044             prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
         2045             if prefix:
         2046                 localname = prefix + ':' + localname
         2047             elif namespace and not qname: #Expat
         2048                 for name,value in self.namespacesInUse.items():
         2049                     if name and value == namespace:
         2050                         localname = name + ':' + localname
         2051                         break
         2052             localname = str(localname).lower()
         2053             self.unknown_endtag(localname)
         2054 
         2055         def error(self, exc):
         2056             self.bozo = 1
         2057             self.exc = exc
         2058 
         2059         # drv_libxml2 calls warning() in some cases
         2060         warning = error
         2061 
         2062         def fatalError(self, exc):
         2063             self.error(exc)
         2064             raise exc
         2065 
         2066 class _BaseHTMLProcessor(sgmllib.SGMLParser):
         2067     special = re.compile('''[<>'"]''')
         2068     bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
         2069     elements_no_end_tag = set([
         2070       'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
         2071       'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
         2072       'source', 'track', 'wbr'
         2073     ])
         2074 
         2075     def __init__(self, encoding, _type):
         2076         self.encoding = encoding
         2077         self._type = _type
         2078         sgmllib.SGMLParser.__init__(self)
         2079 
         2080     def reset(self):
         2081         self.pieces = []
         2082         sgmllib.SGMLParser.reset(self)
         2083 
         2084     def _shorttag_replace(self, match):
         2085         tag = match.group(1)
         2086         if tag in self.elements_no_end_tag:
         2087             return '<' + tag + ' />'
         2088         else:
         2089             return '<' + tag + '></' + tag + '>'
         2090 
         2091     # By declaring these methods and overriding their compiled code
         2092     # with the code from sgmllib, the original code will execute in
         2093     # feedparser's scope instead of sgmllib's. This means that the
         2094     # `tagfind` and `charref` regular expressions will be found as
         2095     # they're declared above, not as they're declared in sgmllib.
         2096     def goahead(self, i):
         2097         pass
         2098     goahead.func_code = sgmllib.SGMLParser.goahead.func_code
         2099 
         2100     def __parse_starttag(self, i):
         2101         pass
         2102     __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
         2103 
         2104     def parse_starttag(self,i):
         2105         j = self.__parse_starttag(i)
         2106         if self._type == 'application/xhtml+xml':
         2107             if j>2 and self.rawdata[j-2:j]=='/>':
         2108                 self.unknown_endtag(self.lasttag)
         2109         return j
         2110 
         2111     def feed(self, data):
         2112         data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
         2113         data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
         2114         data = data.replace('&#39;', "'")
         2115         data = data.replace('&#34;', '"')
         2116         try:
         2117             bytes
         2118             if bytes is str:
         2119                 raise NameError
         2120             self.encoding = self.encoding + u'_INVALID_PYTHON_3'
         2121         except NameError:
         2122             if self.encoding and isinstance(data, unicode):
         2123                 data = data.encode(self.encoding)
         2124         sgmllib.SGMLParser.feed(self, data)
         2125         sgmllib.SGMLParser.close(self)
         2126 
         2127     def normalize_attrs(self, attrs):
         2128         if not attrs:
         2129             return attrs
         2130         # utility method to be called by descendants
         2131         attrs = dict([(k.lower(), v) for k, v in attrs]).items()
         2132         attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
         2133         attrs.sort()
         2134         return attrs
         2135 
         2136     def unknown_starttag(self, tag, attrs):
         2137         # called for each start tag
         2138         # attrs is a list of (attr, value) tuples
         2139         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
         2140         uattrs = []
         2141         strattrs=''
         2142         if attrs:
         2143             for key, value in attrs:
         2144                 value=value.replace('>','&gt;').replace('<','&lt;').replace('"','&quot;')
         2145                 value = self.bare_ampersand.sub("&amp;", value)
         2146                 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
         2147                 if not isinstance(value, unicode):
         2148                     value = value.decode(self.encoding, 'ignore')
         2149                 try:
         2150                     # Currently, in Python 3 the key is already a str, and cannot be decoded again
         2151                     uattrs.append((unicode(key, self.encoding), value))
         2152                 except TypeError:
         2153                     uattrs.append((key, value))
         2154             strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
         2155             if self.encoding:
         2156                 try:
         2157                     strattrs = strattrs.encode(self.encoding)
         2158                 except (UnicodeEncodeError, LookupError):
         2159                     pass
         2160         if tag in self.elements_no_end_tag:
         2161             self.pieces.append('<%s%s />' % (tag, strattrs))
         2162         else:
         2163             self.pieces.append('<%s%s>' % (tag, strattrs))
         2164 
         2165     def unknown_endtag(self, tag):
         2166         # called for each end tag, e.g. for </pre>, tag will be 'pre'
         2167         # Reconstruct the original end tag.
         2168         if tag not in self.elements_no_end_tag:
         2169             self.pieces.append("</%s>" % tag)
         2170 
         2171     def handle_charref(self, ref):
         2172         # called for each character reference, e.g. for '&#160;', ref will be '160'
         2173         # Reconstruct the original character reference.
         2174         ref = ref.lower()
         2175         if ref.startswith('x'):
         2176             value = int(ref[1:], 16)
         2177         else:
         2178             value = int(ref)
         2179 
         2180         if value in _cp1252:
         2181             self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
         2182         else:
         2183             self.pieces.append('&#%s;' % ref)
         2184 
         2185     def handle_entityref(self, ref):
         2186         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
         2187         # Reconstruct the original entity reference.
         2188         if ref in name2codepoint or ref == 'apos':
         2189             self.pieces.append('&%s;' % ref)
         2190         else:
         2191             self.pieces.append('&amp;%s' % ref)
         2192 
         2193     def handle_data(self, text):
         2194         # called for each block of plain text, i.e. outside of any tag and
         2195         # not containing any character or entity references
         2196         # Store the original text verbatim.
         2197         self.pieces.append(text)
         2198 
         2199     def handle_comment(self, text):
         2200         # called for each HTML comment, e.g. <!-- insert Javascript code here -->
         2201         # Reconstruct the original comment.
         2202         self.pieces.append('<!--%s-->' % text)
         2203 
         2204     def handle_pi(self, text):
         2205         # called for each processing instruction, e.g. <?instruction>
         2206         # Reconstruct original processing instruction.
         2207         self.pieces.append('<?%s>' % text)
         2208 
         2209     def handle_decl(self, text):
         2210         # called for the DOCTYPE, if present, e.g.
         2211         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         2212         #     "http://www.w3.org/TR/html4/loose.dtd">
         2213         # Reconstruct original DOCTYPE
         2214         self.pieces.append('<!%s>' % text)
         2215 
         2216     _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
         2217     def _scan_name(self, i, declstartpos):
         2218         rawdata = self.rawdata
         2219         n = len(rawdata)
         2220         if i == n:
         2221             return None, -1
         2222         m = self._new_declname_match(rawdata, i)
         2223         if m:
         2224             s = m.group()
         2225             name = s.strip()
         2226             if (i + len(s)) == n:
         2227                 return None, -1  # end of buffer
         2228             return name.lower(), m.end()
         2229         else:
         2230             self.handle_data(rawdata)
         2231 #            self.updatepos(declstartpos, i)
         2232             return None, -1
         2233 
         2234     def convert_charref(self, name):
         2235         return '&#%s;' % name
         2236 
         2237     def convert_entityref(self, name):
         2238         return '&%s;' % name
         2239 
         2240     def output(self):
         2241         '''Return processed HTML as a single string'''
         2242         return ''.join([str(p) for p in self.pieces])
         2243 
         2244     def parse_declaration(self, i):
         2245         try:
         2246             return sgmllib.SGMLParser.parse_declaration(self, i)
         2247         except sgmllib.SGMLParseError:
         2248             # escape the doctype declaration and continue parsing
         2249             self.handle_data('&lt;')
         2250             return i+1
         2251 
         2252 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
         2253     def __init__(self, baseuri, baselang, encoding, entities):
         2254         sgmllib.SGMLParser.__init__(self)
         2255         _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
         2256         _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
         2257         self.entities=entities
         2258 
         2259     def decodeEntities(self, element, data):
         2260         data = data.replace('&#60;', '&lt;')
         2261         data = data.replace('&#x3c;', '&lt;')
         2262         data = data.replace('&#x3C;', '&lt;')
         2263         data = data.replace('&#62;', '&gt;')
         2264         data = data.replace('&#x3e;', '&gt;')
         2265         data = data.replace('&#x3E;', '&gt;')
         2266         data = data.replace('&#38;', '&amp;')
         2267         data = data.replace('&#x26;', '&amp;')
         2268         data = data.replace('&#34;', '&quot;')
         2269         data = data.replace('&#x22;', '&quot;')
         2270         data = data.replace('&#39;', '&apos;')
         2271         data = data.replace('&#x27;', '&apos;')
         2272         if not self.contentparams.get('type', u'xml').endswith(u'xml'):
         2273             data = data.replace('&lt;', '<')
         2274             data = data.replace('&gt;', '>')
         2275             data = data.replace('&amp;', '&')
         2276             data = data.replace('&quot;', '"')
         2277             data = data.replace('&apos;', "'")
         2278             data = data.replace('&#x2f;', '/')
         2279             data = data.replace('&#x2F;', '/')
         2280         return data
         2281 
         2282     def strattrs(self, attrs):
         2283         return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
         2284 
         2285 class _RelativeURIResolver(_BaseHTMLProcessor):
         2286     relative_uris = set([('a', 'href'),
         2287                      ('applet', 'codebase'),
         2288                      ('area', 'href'),
         2289                      ('audio', 'src'),
         2290                      ('blockquote', 'cite'),
         2291                      ('body', 'background'),
         2292                      ('del', 'cite'),
         2293                      ('form', 'action'),
         2294                      ('frame', 'longdesc'),
         2295                      ('frame', 'src'),
         2296                      ('iframe', 'longdesc'),
         2297                      ('iframe', 'src'),
         2298                      ('head', 'profile'),
         2299                      ('img', 'longdesc'),
         2300                      ('img', 'src'),
         2301                      ('img', 'usemap'),
         2302                      ('input', 'src'),
         2303                      ('input', 'usemap'),
         2304                      ('ins', 'cite'),
         2305                      ('link', 'href'),
         2306                      ('object', 'classid'),
         2307                      ('object', 'codebase'),
         2308                      ('object', 'data'),
         2309                      ('object', 'usemap'),
         2310                      ('q', 'cite'),
         2311                      ('script', 'src'),
         2312                      ('source', 'src'),
         2313                      ('video', 'poster'),
         2314                      ('video', 'src')])
         2315 
         2316     def __init__(self, baseuri, encoding, _type):
         2317         _BaseHTMLProcessor.__init__(self, encoding, _type)
         2318         self.baseuri = baseuri
         2319 
         2320     def resolveURI(self, uri):
         2321         return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
         2322 
         2323     def unknown_starttag(self, tag, attrs):
         2324         attrs = self.normalize_attrs(attrs)
         2325         attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
         2326         _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
         2327 
         2328 def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
         2329     if not _SGML_AVAILABLE:
         2330         return htmlSource
         2331 
         2332     p = _RelativeURIResolver(baseURI, encoding, _type)
         2333     p.feed(htmlSource)
         2334     return p.output()
         2335 
         2336 def _makeSafeAbsoluteURI(base, rel=None):
         2337     # bail if ACCEPTABLE_URI_SCHEMES is empty
         2338     if not ACCEPTABLE_URI_SCHEMES:
         2339         return _urljoin(base, rel or u'')
         2340     if not base:
         2341         return rel or u''
         2342     if not rel:
         2343         try:
         2344             scheme = urlparse.urlparse(base)[0]
         2345         except ValueError:
         2346             return u''
         2347         if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
         2348             return base
         2349         return u''
         2350     uri = _urljoin(base, rel)
         2351     if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
         2352         return u''
         2353     return uri
         2354 
         2355 class _HTMLSanitizer(_BaseHTMLProcessor):
         2356     acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
         2357         'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
         2358         'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
         2359         'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
         2360         'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
         2361         'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
         2362         'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
         2363         'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
         2364         'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
         2365         'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
         2366         'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
         2367         'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
         2368         'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
         2369 
         2370     acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
         2371       'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
         2372       'background', 'balance', 'bgcolor', 'bgproperties', 'border',
         2373       'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
         2374       'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
         2375       'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
         2376       'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
         2377       'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
         2378       'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
         2379       'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
         2380       'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
         2381       'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
         2382       'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
         2383       'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
         2384       'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
         2385       'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
         2386       'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
         2387       'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
         2388       'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
         2389       'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
         2390       'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
         2391       'width', 'wrap', 'xml:lang'])
         2392 
         2393     unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
         2394 
         2395     acceptable_css_properties = set(['azimuth', 'background-color',
         2396       'border-bottom-color', 'border-collapse', 'border-color',
         2397       'border-left-color', 'border-right-color', 'border-top-color', 'clear',
         2398       'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
         2399       'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
         2400       'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
         2401       'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
         2402       'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
         2403       'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
         2404       'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
         2405       'white-space', 'width'])
         2406 
         2407     # survey of common keywords found in feeds
         2408     acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
         2409       'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
         2410       'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
         2411       'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
         2412       'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
         2413       'transparent', 'underline', 'white', 'yellow'])
         2414 
         2415     valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
         2416       '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
         2417 
         2418     mathml_elements = set([
         2419         'annotation',
         2420         'annotation-xml',
         2421         'maction',
         2422         'maligngroup',
         2423         'malignmark',
         2424         'math',
         2425         'menclose',
         2426         'merror',
         2427         'mfenced',
         2428         'mfrac',
         2429         'mglyph',
         2430         'mi',
         2431         'mlabeledtr',
         2432         'mlongdiv',
         2433         'mmultiscripts',
         2434         'mn',
         2435         'mo',
         2436         'mover',
         2437         'mpadded',
         2438         'mphantom',
         2439         'mprescripts',
         2440         'mroot',
         2441         'mrow',
         2442         'ms',
         2443         'mscarries',
         2444         'mscarry',
         2445         'msgroup',
         2446         'msline',
         2447         'mspace',
         2448         'msqrt',
         2449         'msrow',
         2450         'mstack',
         2451         'mstyle',
         2452         'msub',
         2453         'msubsup',
         2454         'msup',
         2455         'mtable',
         2456         'mtd',
         2457         'mtext',
         2458         'mtr',
         2459         'munder',
         2460         'munderover',
         2461         'none',
         2462         'semantics',
         2463     ])
         2464 
         2465     mathml_attributes = set([
         2466         'accent',
         2467         'accentunder',
         2468         'actiontype',
         2469         'align',
         2470         'alignmentscope',
         2471         'altimg',
         2472         'altimg-height',
         2473         'altimg-valign',
         2474         'altimg-width',
         2475         'alttext',
         2476         'bevelled',
         2477         'charalign',
         2478         'close',
         2479         'columnalign',
         2480         'columnlines',
         2481         'columnspacing',
         2482         'columnspan',
         2483         'columnwidth',
         2484         'crossout',
         2485         'decimalpoint',
         2486         'denomalign',
         2487         'depth',
         2488         'dir',
         2489         'display',
         2490         'displaystyle',
         2491         'edge',
         2492         'encoding',
         2493         'equalcolumns',
         2494         'equalrows',
         2495         'fence',
         2496         'fontstyle',
         2497         'fontweight',
         2498         'form',
         2499         'frame',
         2500         'framespacing',
         2501         'groupalign',
         2502         'height',
         2503         'href',
         2504         'id',
         2505         'indentalign',
         2506         'indentalignfirst',
         2507         'indentalignlast',
         2508         'indentshift',
         2509         'indentshiftfirst',
         2510         'indentshiftlast',
         2511         'indenttarget',
         2512         'infixlinebreakstyle',
         2513         'largeop',
         2514         'length',
         2515         'linebreak',
         2516         'linebreakmultchar',
         2517         'linebreakstyle',
         2518         'lineleading',
         2519         'linethickness',
         2520         'location',
         2521         'longdivstyle',
         2522         'lquote',
         2523         'lspace',
         2524         'mathbackground',
         2525         'mathcolor',
         2526         'mathsize',
         2527         'mathvariant',
         2528         'maxsize',
         2529         'minlabelspacing',
         2530         'minsize',
         2531         'movablelimits',
         2532         'notation',
         2533         'numalign',
         2534         'open',
         2535         'other',
         2536         'overflow',
         2537         'position',
         2538         'rowalign',
         2539         'rowlines',
         2540         'rowspacing',
         2541         'rowspan',
         2542         'rquote',
         2543         'rspace',
         2544         'scriptlevel',
         2545         'scriptminsize',
         2546         'scriptsizemultiplier',
         2547         'selection',
         2548         'separator',
         2549         'separators',
         2550         'shift',
         2551         'side',
         2552         'src',
         2553         'stackalign',
         2554         'stretchy',
         2555         'subscriptshift',
         2556         'superscriptshift',
         2557         'symmetric',
         2558         'voffset',
         2559         'width',
         2560         'xlink:href',
         2561         'xlink:show',
         2562         'xlink:type',
         2563         'xmlns',
         2564         'xmlns:xlink',
         2565     ])
         2566 
         2567     # svgtiny - foreignObject + linearGradient + radialGradient + stop
         2568     svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
         2569       'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
         2570       'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
         2571       'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
         2572       'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
         2573       'svg', 'switch', 'text', 'title', 'tspan', 'use'])
         2574 
         2575     # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
         2576     svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
         2577        'arabic-form', 'ascent', 'attributeName', 'attributeType',
         2578        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
         2579        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
         2580        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
         2581        'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
         2582        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
         2583        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
         2584        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
         2585        'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
         2586        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
         2587        'min', 'name', 'offset', 'opacity', 'orient', 'origin',
         2588        'overline-position', 'overline-thickness', 'panose-1', 'path',
         2589        'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
         2590        'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
         2591        'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
         2592        'stop-color', 'stop-opacity', 'strikethrough-position',
         2593        'strikethrough-thickness', 'stroke', 'stroke-dasharray',
         2594        'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
         2595        'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
         2596        'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
         2597        'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
         2598        'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
         2599        'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
         2600        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
         2601        'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
         2602        'y2', 'zoomAndPan'])
         2603 
         2604     svg_attr_map = None
         2605     svg_elem_map = None
         2606 
         2607     acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
         2608       'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
         2609       'stroke-opacity'])
         2610 
         2611     def reset(self):
         2612         _BaseHTMLProcessor.reset(self)
         2613         self.unacceptablestack = 0
         2614         self.mathmlOK = 0
         2615         self.svgOK = 0
         2616 
         2617     def unknown_starttag(self, tag, attrs):
         2618         acceptable_attributes = self.acceptable_attributes
         2619         keymap = {}
         2620         if not tag in self.acceptable_elements or self.svgOK:
         2621             if tag in self.unacceptable_elements_with_end_tag:
         2622                 self.unacceptablestack += 1
         2623 
         2624             # add implicit namespaces to html5 inline svg/mathml
         2625             if self._type.endswith('html'):
         2626                 if not dict(attrs).get('xmlns'):
         2627                     if tag=='svg':
         2628                         attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
         2629                     if tag=='math':
         2630                         attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
         2631 
         2632             # not otherwise acceptable, perhaps it is MathML or SVG?
         2633             if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
         2634                 self.mathmlOK += 1
         2635             if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
         2636                 self.svgOK += 1
         2637 
         2638             # chose acceptable attributes based on tag class, else bail
         2639             if  self.mathmlOK and tag in self.mathml_elements:
         2640                 acceptable_attributes = self.mathml_attributes
         2641             elif self.svgOK and tag in self.svg_elements:
         2642                 # for most vocabularies, lowercasing is a good idea.  Many
         2643                 # svg elements, however, are camel case
         2644                 if not self.svg_attr_map:
         2645                     lower=[attr.lower() for attr in self.svg_attributes]
         2646                     mix=[a for a in self.svg_attributes if a not in lower]
         2647                     self.svg_attributes = lower
         2648                     self.svg_attr_map = dict([(a.lower(),a) for a in mix])
         2649 
         2650                     lower=[attr.lower() for attr in self.svg_elements]
         2651                     mix=[a for a in self.svg_elements if a not in lower]
         2652                     self.svg_elements = lower
         2653                     self.svg_elem_map = dict([(a.lower(),a) for a in mix])
         2654                 acceptable_attributes = self.svg_attributes
         2655                 tag = self.svg_elem_map.get(tag,tag)
         2656                 keymap = self.svg_attr_map
         2657             elif not tag in self.acceptable_elements:
         2658                 return
         2659 
         2660         # declare xlink namespace, if needed
         2661         if self.mathmlOK or self.svgOK:
         2662             if filter(lambda (n,v): n.startswith('xlink:'),attrs):
         2663                 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
         2664                     attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
         2665 
         2666         clean_attrs = []
         2667         for key, value in self.normalize_attrs(attrs):
         2668             if key in acceptable_attributes:
         2669                 key=keymap.get(key,key)
         2670                 # make sure the uri uses an acceptable uri scheme
         2671                 if key == u'href':
         2672                     value = _makeSafeAbsoluteURI(value)
         2673                 clean_attrs.append((key,value))
         2674             elif key=='style':
         2675                 clean_value = self.sanitize_style(value)
         2676                 if clean_value:
         2677                     clean_attrs.append((key,clean_value))
         2678         _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
         2679 
         2680     def unknown_endtag(self, tag):
         2681         if not tag in self.acceptable_elements:
         2682             if tag in self.unacceptable_elements_with_end_tag:
         2683                 self.unacceptablestack -= 1
         2684             if self.mathmlOK and tag in self.mathml_elements:
         2685                 if tag == 'math' and self.mathmlOK:
         2686                     self.mathmlOK -= 1
         2687             elif self.svgOK and tag in self.svg_elements:
         2688                 tag = self.svg_elem_map.get(tag,tag)
         2689                 if tag == 'svg' and self.svgOK:
         2690                     self.svgOK -= 1
         2691             else:
         2692                 return
         2693         _BaseHTMLProcessor.unknown_endtag(self, tag)
         2694 
         2695     def handle_pi(self, text):
         2696         pass
         2697 
         2698     def handle_decl(self, text):
         2699         pass
         2700 
         2701     def handle_data(self, text):
         2702         if not self.unacceptablestack:
         2703             _BaseHTMLProcessor.handle_data(self, text)
         2704 
         2705     def sanitize_style(self, style):
         2706         # disallow urls
         2707         style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
         2708 
         2709         # gauntlet
         2710         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
         2711             return ''
         2712         # This replaced a regexp that used re.match and was prone to pathological back-tracking.
         2713         if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
         2714             return ''
         2715 
         2716         clean = []
         2717         for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
         2718             if not value:
         2719                 continue
         2720             if prop.lower() in self.acceptable_css_properties:
         2721                 clean.append(prop + ': ' + value + ';')
         2722             elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
         2723                 for keyword in value.split():
         2724                     if not keyword in self.acceptable_css_keywords and \
         2725                         not self.valid_css_values.match(keyword):
         2726                         break
         2727                 else:
         2728                     clean.append(prop + ': ' + value + ';')
         2729             elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
         2730                 clean.append(prop + ': ' + value + ';')
         2731 
         2732         return ' '.join(clean)
         2733 
         2734     def parse_comment(self, i, report=1):
         2735         ret = _BaseHTMLProcessor.parse_comment(self, i, report)
         2736         if ret >= 0:
         2737             return ret
         2738         # if ret == -1, this may be a malicious attempt to circumvent
         2739         # sanitization, or a page-destroying unclosed comment
         2740         match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
         2741         if match:
         2742             return match.end()
         2743         # unclosed comment; deliberately fail to handle_data()
         2744         return len(self.rawdata)
         2745 
         2746 
         2747 def _sanitizeHTML(htmlSource, encoding, _type):
         2748     if not _SGML_AVAILABLE:
         2749         return htmlSource
         2750     p = _HTMLSanitizer(encoding, _type)
         2751     htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
         2752     p.feed(htmlSource)
         2753     data = p.output()
         2754     data = data.strip().replace('\r\n', '\n')
         2755     return data
         2756 
         2757 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
         2758     def http_error_default(self, req, fp, code, msg, headers):
         2759         # The default implementation just raises HTTPError.
         2760         # Forget that.
         2761         fp.status = code
         2762         return fp
         2763 
         2764     def http_error_301(self, req, fp, code, msg, hdrs):
         2765         result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
         2766                                                             code, msg, hdrs)
         2767         result.status = code
         2768         result.newurl = result.geturl()
         2769         return result
         2770     # The default implementations in urllib2.HTTPRedirectHandler
         2771     # are identical, so hardcoding a http_error_301 call above
         2772     # won't affect anything
         2773     http_error_300 = http_error_301
         2774     http_error_302 = http_error_301
         2775     http_error_303 = http_error_301
         2776     http_error_307 = http_error_301
         2777 
         2778     def http_error_401(self, req, fp, code, msg, headers):
         2779         # Check if
         2780         # - server requires digest auth, AND
         2781         # - we tried (unsuccessfully) with basic auth, AND
         2782         # If all conditions hold, parse authentication information
         2783         # out of the Authorization header we sent the first time
         2784         # (for the username and password) and the WWW-Authenticate
         2785         # header the server sent back (for the realm) and retry
         2786         # the request with the appropriate digest auth headers instead.
         2787         # This evil genius hack has been brought to you by Aaron Swartz.
         2788         host = urlparse.urlparse(req.get_full_url())[1]
         2789         if base64 is None or 'Authorization' not in req.headers \
         2790                           or 'WWW-Authenticate' not in headers:
         2791             return self.http_error_default(req, fp, code, msg, headers)
         2792         auth = _base64decode(req.headers['Authorization'].split(' ')[1])
         2793         user, passw = auth.split(':')
         2794         realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
         2795         self.add_password(realm, host, user, passw)
         2796         retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
         2797         self.reset_retry_count()
         2798         return retry
         2799 
         2800 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
         2801     """URL, filename, or string --> stream
         2802 
         2803     This function lets you define parsers that take any input source
         2804     (URL, pathname to local or network file, or actual data as a string)
         2805     and deal with it in a uniform manner.  Returned object is guaranteed
         2806     to have all the basic stdio read methods (read, readline, readlines).
         2807     Just .close() the object when you're done with it.
         2808 
         2809     If the etag argument is supplied, it will be used as the value of an
         2810     If-None-Match request header.
         2811 
         2812     If the modified argument is supplied, it can be a tuple of 9 integers
         2813     (as returned by gmtime() in the standard Python time module) or a date
         2814     string in any format supported by feedparser. Regardless, it MUST
         2815     be in GMT (Greenwich Mean Time). It will be reformatted into an
         2816     RFC 1123-compliant date and used as the value of an If-Modified-Since
         2817     request header.
         2818 
         2819     If the agent argument is supplied, it will be used as the value of a
         2820     User-Agent request header.
         2821 
         2822     If the referrer argument is supplied, it will be used as the value of a
         2823     Referer[sic] request header.
         2824 
         2825     If handlers is supplied, it is a list of handlers used to build a
         2826     urllib2 opener.
         2827 
         2828     if request_headers is supplied it is a dictionary of HTTP request headers
         2829     that will override the values generated by FeedParser.
         2830 
         2831     :return: A :class:`StringIO.StringIO` or :class:`io.BytesIO`.
         2832     """
         2833 
         2834     if hasattr(url_file_stream_or_string, 'read'):
         2835         return url_file_stream_or_string
         2836 
         2837     if isinstance(url_file_stream_or_string, basestring) \
         2838        and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
         2839         # Deal with the feed URI scheme
         2840         if url_file_stream_or_string.startswith('feed:http'):
         2841             url_file_stream_or_string = url_file_stream_or_string[5:]
         2842         elif url_file_stream_or_string.startswith('feed:'):
         2843             url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
         2844         if not agent:
         2845             agent = USER_AGENT
         2846         # Test for inline user:password credentials for HTTP basic auth
         2847         auth = None
         2848         if base64 and not url_file_stream_or_string.startswith('ftp:'):
         2849             urltype, rest = urllib.splittype(url_file_stream_or_string)
         2850             realhost, rest = urllib.splithost(rest)
         2851             if realhost:
         2852                 user_passwd, realhost = urllib.splituser(realhost)
         2853                 if user_passwd:
         2854                     url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
         2855                     auth = base64.standard_b64encode(user_passwd).strip()
         2856 
         2857         # iri support
         2858         if isinstance(url_file_stream_or_string, unicode):
         2859             url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
         2860 
         2861         # try to open with urllib2 (to use optional headers)
         2862         request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
         2863         opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
         2864         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
         2865         try:
         2866             return opener.open(request)
         2867         finally:
         2868             opener.close() # JohnD
         2869 
         2870     # try to open with native open function (if url_file_stream_or_string is a filename)
         2871     try:
         2872         return open(url_file_stream_or_string, 'rb')
         2873     except (IOError, UnicodeEncodeError, TypeError):
         2874         # if url_file_stream_or_string is a unicode object that
         2875         # cannot be converted to the encoding returned by
         2876         # sys.getfilesystemencoding(), a UnicodeEncodeError
         2877         # will be thrown
         2878         # If url_file_stream_or_string is a string that contains NULL
         2879         # (such as an XML document encoded in UTF-32), TypeError will
         2880         # be thrown.
         2881         pass
         2882 
         2883     # treat url_file_stream_or_string as string
         2884     if isinstance(url_file_stream_or_string, unicode):
         2885         return _StringIO(url_file_stream_or_string.encode('utf-8'))
         2886     return _StringIO(url_file_stream_or_string)
         2887 
         2888 def _convert_to_idn(url):
         2889     """Convert a URL to IDN notation"""
         2890     # this function should only be called with a unicode string
         2891     # strategy: if the host cannot be encoded in ascii, then
         2892     # it'll be necessary to encode it in idn form
         2893     parts = list(urlparse.urlsplit(url))
         2894     try:
         2895         parts[1].encode('ascii')
         2896     except UnicodeEncodeError:
         2897         # the url needs to be converted to idn notation
         2898         host = parts[1].rsplit(':', 1)
         2899         newhost = []
         2900         port = u''
         2901         if len(host) == 2:
         2902             port = host.pop()
         2903         for h in host[0].split('.'):
         2904             newhost.append(h.encode('idna').decode('utf-8'))
         2905         parts[1] = '.'.join(newhost)
         2906         if port:
         2907             parts[1] += ':' + port
         2908         return urlparse.urlunsplit(parts)
         2909     else:
         2910         return url
         2911 
         2912 def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
         2913     request = urllib2.Request(url)
         2914     request.add_header('User-Agent', agent)
         2915     if etag:
         2916         request.add_header('If-None-Match', etag)
         2917     if isinstance(modified, basestring):
         2918         modified = _parse_date(modified)
         2919     elif isinstance(modified, datetime.datetime):
         2920         modified = modified.utctimetuple()
         2921     if modified:
         2922         # format into an RFC 1123-compliant timestamp. We can't use
         2923         # time.strftime() since the %a and %b directives can be affected
         2924         # by the current locale, but RFC 2616 states that dates must be
         2925         # in English.
         2926         short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
         2927         months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
         2928         request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
         2929     if referrer:
         2930         request.add_header('Referer', referrer)
         2931     if gzip and zlib:
         2932         request.add_header('Accept-encoding', 'gzip, deflate')
         2933     elif gzip:
         2934         request.add_header('Accept-encoding', 'gzip')
         2935     elif zlib:
         2936         request.add_header('Accept-encoding', 'deflate')
         2937     else:
         2938         request.add_header('Accept-encoding', '')
         2939     if auth:
         2940         request.add_header('Authorization', 'Basic %s' % auth)
         2941     if ACCEPT_HEADER:
         2942         request.add_header('Accept', ACCEPT_HEADER)
         2943     # use this for whatever -- cookies, special headers, etc
         2944     # [('Cookie','Something'),('x-special-header','Another Value')]
         2945     for header_name, header_value in request_headers.items():
         2946         request.add_header(header_name, header_value)
         2947     request.add_header('A-IM', 'feed') # RFC 3229 support
         2948     return request
         2949 
         2950 def _parse_psc_chapter_start(start):
         2951     FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'
         2952 
         2953     m = re.compile(FORMAT).match(start)
         2954     if m is None:
         2955         return None
         2956 
         2957     _, h, m, s, _, ms = m.groups()
         2958     h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
         2959     return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)
         2960 
         2961 _date_handlers = []
         2962 def registerDateHandler(func):
         2963     '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
         2964     _date_handlers.insert(0, func)
         2965 
         2966 # ISO-8601 date parsing routines written by Fazal Majid.
         2967 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
         2968 # parser is beyond the scope of feedparser and would be a worthwhile addition
         2969 # to the Python library.
         2970 # A single regular expression cannot parse ISO 8601 date formats into groups
         2971 # as the standard is highly irregular (for instance is 030104 2003-01-04 or
         2972 # 0301-04-01), so we use templates instead.
         2973 # Please note the order in templates is significant because we need a
         2974 # greedy match.
         2975 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
         2976                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
         2977                 '-YY-?MM', '-OOO', '-YY',
         2978                 '--MM-?DD', '--MM',
         2979                 '---DD',
         2980                 'CC', '']
         2981 _iso8601_re = [
         2982     tmpl.replace(
         2983     'YYYY', r'(?P<year>\d{4})').replace(
         2984     'YY', r'(?P<year>\d\d)').replace(
         2985     'MM', r'(?P<month>[01]\d)').replace(
         2986     'DD', r'(?P<day>[0123]\d)').replace(
         2987     'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
         2988     'CC', r'(?P<century>\d\d$)')
         2989     + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
         2990     + r'(:(?P<second>\d{2}))?'
         2991     + r'(\.(?P<fracsecond>\d+))?'
         2992     + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
         2993     for tmpl in _iso8601_tmpl]
         2994 try:
         2995     del tmpl
         2996 except NameError:
         2997     pass
         2998 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
         2999 try:
         3000     del regex
         3001 except NameError:
         3002     pass
         3003 
         3004 def _parse_date_iso8601(dateString):
         3005     '''Parse a variety of ISO-8601-compatible formats like 20040105'''
         3006     m = None
         3007     for _iso8601_match in _iso8601_matches:
         3008         m = _iso8601_match(dateString)
         3009         if m:
         3010             break
         3011     if not m:
         3012         return
         3013     if m.span() == (0, 0):
         3014         return
         3015     params = m.groupdict()
         3016     ordinal = params.get('ordinal', 0)
         3017     if ordinal:
         3018         ordinal = int(ordinal)
         3019     else:
         3020         ordinal = 0
         3021     year = params.get('year', '--')
         3022     if not year or year == '--':
         3023         year = time.gmtime()[0]
         3024     elif len(year) == 2:
         3025         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
         3026         year = 100 * int(time.gmtime()[0] / 100) + int(year)
         3027     else:
         3028         year = int(year)
         3029     month = params.get('month', '-')
         3030     if not month or month == '-':
         3031         # ordinals are NOT normalized by mktime, we simulate them
         3032         # by setting month=1, day=ordinal
         3033         if ordinal:
         3034             month = 1
         3035         else:
         3036             month = time.gmtime()[1]
         3037     month = int(month)
         3038     day = params.get('day', 0)
         3039     if not day:
         3040         # see above
         3041         if ordinal:
         3042             day = ordinal
         3043         elif params.get('century', 0) or \
         3044                  params.get('year', 0) or params.get('month', 0):
         3045             day = 1
         3046         else:
         3047             day = time.gmtime()[2]
         3048     else:
         3049         day = int(day)
         3050     # special case of the century - is the first year of the 21st century
         3051     # 2000 or 2001 ? The debate goes on...
         3052     if 'century' in params:
         3053         year = (int(params['century']) - 1) * 100 + 1
         3054     # in ISO 8601 most fields are optional
         3055     for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
         3056         if not params.get(field, None):
         3057             params[field] = 0
         3058     hour = int(params.get('hour', 0))
         3059     minute = int(params.get('minute', 0))
         3060     second = int(float(params.get('second', 0)))
         3061     # weekday is normalized by mktime(), we can ignore it
         3062     weekday = 0
         3063     daylight_savings_flag = -1
         3064     tm = [year, month, day, hour, minute, second, weekday,
         3065           ordinal, daylight_savings_flag]
         3066     # ISO 8601 time zone adjustments
         3067     tz = params.get('tz')
         3068     if tz and tz != 'Z':
         3069         if tz[0] == '-':
         3070             tm[3] += int(params.get('tzhour', 0))
         3071             tm[4] += int(params.get('tzmin', 0))
         3072         elif tz[0] == '+':
         3073             tm[3] -= int(params.get('tzhour', 0))
         3074             tm[4] -= int(params.get('tzmin', 0))
         3075         else:
         3076             return None
         3077     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
         3078     # which is guaranteed to normalize d/m/y/h/m/s.
         3079     # Many implementations have bugs, but we'll pretend they don't.
         3080     return time.localtime(time.mktime(tuple(tm)))
         3081 registerDateHandler(_parse_date_iso8601)
         3082 
         3083 # 8-bit date handling routines written by ytrewq1.
         3084 _korean_year  = u'\ub144' # b3e2 in euc-kr
         3085 _korean_month = u'\uc6d4' # bff9 in euc-kr
         3086 _korean_day   = u'\uc77c' # c0cf in euc-kr
         3087 _korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
         3088 _korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
         3089 
         3090 _korean_onblog_date_re = \
         3091     re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
         3092                (_korean_year, _korean_month, _korean_day))
         3093 _korean_nate_date_re = \
         3094     re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
         3095                (_korean_am, _korean_pm))
         3096 def _parse_date_onblog(dateString):
         3097     '''Parse a string according to the OnBlog 8-bit date format'''
         3098     m = _korean_onblog_date_re.match(dateString)
         3099     if not m:
         3100         return
         3101     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
         3102                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
         3103                  'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
         3104                  'zonediff': '+09:00'}
         3105     return _parse_date_w3dtf(w3dtfdate)
         3106 registerDateHandler(_parse_date_onblog)
         3107 
         3108 def _parse_date_nate(dateString):
         3109     '''Parse a string according to the Nate 8-bit date format'''
         3110     m = _korean_nate_date_re.match(dateString)
         3111     if not m:
         3112         return
         3113     hour = int(m.group(5))
         3114     ampm = m.group(4)
         3115     if (ampm == _korean_pm):
         3116         hour += 12
         3117     hour = str(hour)
         3118     if len(hour) == 1:
         3119         hour = '0' + hour
         3120     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
         3121                 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
         3122                  'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
         3123                  'zonediff': '+09:00'}
         3124     return _parse_date_w3dtf(w3dtfdate)
         3125 registerDateHandler(_parse_date_nate)
         3126 
         3127 # Unicode strings for Greek date strings
         3128 _greek_months = \
         3129   { \
         3130    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
         3131    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
         3132    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
         3133    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
         3134    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
         3135    u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
         3136    u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
         3137    u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
         3138    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
         3139    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
         3140    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
         3141    u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
         3142    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
         3143    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
         3144    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
         3145    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
         3146    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
         3147    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
         3148    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
         3149   }
         3150 
         3151 _greek_wdays = \
         3152   { \
         3153    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
         3154    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
         3155    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
         3156    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
         3157    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
         3158    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
         3159    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
         3160   }
         3161 
         3162 _greek_date_format_re = \
         3163     re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
         3164 
         3165 def _parse_date_greek(dateString):
         3166     '''Parse a string according to a Greek 8-bit date format.'''
         3167     m = _greek_date_format_re.match(dateString)
         3168     if not m:
         3169         return
         3170     wday = _greek_wdays[m.group(1)]
         3171     month = _greek_months[m.group(3)]
         3172     rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
         3173                  {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
         3174                   'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
         3175                   'zonediff': m.group(8)}
         3176     return _parse_date_rfc822(rfc822date)
         3177 registerDateHandler(_parse_date_greek)
         3178 
         3179 # Unicode strings for Hungarian date strings
         3180 _hungarian_months = \
         3181   { \
         3182     u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
         3183     u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
         3184     u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
         3185     u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
         3186     u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
         3187     u'j\u00fanius':   u'06',  # fa in iso-8859-2
         3188     u'j\u00falius':   u'07',  # fa in iso-8859-2
         3189     u'augusztus':     u'08',
         3190     u'szeptember':    u'09',
         3191     u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
         3192     u'november':      u'11',
         3193     u'december':      u'12',
         3194   }
         3195 
         3196 _hungarian_date_format_re = \
         3197   re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
         3198 
         3199 def _parse_date_hungarian(dateString):
         3200     '''Parse a string according to a Hungarian 8-bit date format.'''
         3201     m = _hungarian_date_format_re.match(dateString)
         3202     if not m or m.group(2) not in _hungarian_months:
         3203         return None
         3204     month = _hungarian_months[m.group(2)]
         3205     day = m.group(3)
         3206     if len(day) == 1:
         3207         day = '0' + day
         3208     hour = m.group(4)
         3209     if len(hour) == 1:
         3210         hour = '0' + hour
         3211     w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
         3212                 {'year': m.group(1), 'month': month, 'day': day,\
         3213                  'hour': hour, 'minute': m.group(5),\
         3214                  'zonediff': m.group(6)}
         3215     return _parse_date_w3dtf(w3dtfdate)
         3216 registerDateHandler(_parse_date_hungarian)
         3217 
         3218 timezonenames = {
         3219     'ut': 0, 'gmt': 0, 'z': 0,
         3220     'adt': -3, 'ast': -4, 'at': -4,
         3221     'edt': -4, 'est': -5, 'et': -5,
         3222     'cdt': -5, 'cst': -6, 'ct': -6,
         3223     'mdt': -6, 'mst': -7, 'mt': -7,
         3224     'pdt': -7, 'pst': -8, 'pt': -8,
         3225     'a': -1, 'n': 1,
         3226     'm': -12, 'y': 12,
         3227 }
         3228 # W3 date and time format parser
         3229 # http://www.w3.org/TR/NOTE-datetime
         3230 # Also supports MSSQL-style datetimes as defined at:
         3231 # http://msdn.microsoft.com/en-us/library/ms186724.aspx
         3232 # (basically, allow a space as a date/time/timezone separator)
         3233 def _parse_date_w3dtf(datestr):
         3234     if not datestr.strip():
         3235         return None
         3236     parts = datestr.lower().split('t')
         3237     if len(parts) == 1:
         3238         # This may be a date only, or may be an MSSQL-style date
         3239         parts = parts[0].split()
         3240         if len(parts) == 1:
         3241             # Treat this as a date only
         3242             parts.append('00:00:00z')
         3243     elif len(parts) > 2:
         3244         return None
         3245     date = parts[0].split('-', 2)
         3246     if not date or len(date[0]) != 4:
         3247         return None
         3248     # Ensure that `date` has 3 elements. Using '1' sets the default
         3249     # month to January and the default day to the 1st of the month.
         3250     date.extend(['1'] * (3 - len(date)))
         3251     try:
         3252         year, month, day = [int(i) for i in date]
         3253     except ValueError:
         3254         # `date` may have more than 3 elements or may contain
         3255         # non-integer strings.
         3256         return None
         3257     if parts[1].endswith('z'):
         3258         parts[1] = parts[1][:-1]
         3259         parts.append('z')
         3260     # Append the numeric timezone offset, if any, to parts.
         3261     # If this is an MSSQL-style date then parts[2] already contains
         3262     # the timezone information, so `append()` will not affect it.
         3263     # Add 1 to each value so that if `find()` returns -1 it will be
         3264     # treated as False.
         3265     loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
         3266     loc = loc - 1
         3267     parts.append(parts[1][loc:])
         3268     parts[1] = parts[1][:loc]
         3269     time = parts[1].split(':', 2)
         3270     # Ensure that time has 3 elements. Using '0' means that the
         3271     # minutes and seconds, if missing, will default to 0.
         3272     time.extend(['0'] * (3 - len(time)))
         3273     tzhour = 0
         3274     tzmin = 0
         3275     if parts[2][:1] in ('-', '+'):
         3276         try:
         3277             tzhour = int(parts[2][1:3])
         3278             tzmin = int(parts[2][4:])
         3279         except ValueError:
         3280             return None
         3281         if parts[2].startswith('-'):
         3282             tzhour = tzhour * -1
         3283             tzmin = tzmin * -1
         3284     else:
         3285         tzhour = timezonenames.get(parts[2], 0)
         3286     try:
         3287         hour, minute, second = [int(float(i)) for i in time]
         3288     except ValueError:
         3289         return None
         3290     # Create the datetime object and timezone delta objects
         3291     try:
         3292         stamp = datetime.datetime(year, month, day, hour, minute, second)
         3293     except ValueError:
         3294         return None
         3295     delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
         3296     # Return the date and timestamp in a UTC 9-tuple
         3297     try:
         3298         return (stamp - delta).utctimetuple()
         3299     except (OverflowError, ValueError):
         3300         # IronPython throws ValueErrors instead of OverflowErrors
         3301         return None
         3302 
         3303 registerDateHandler(_parse_date_w3dtf)
         3304 
         3305 def _parse_date_rfc822(date):
         3306     """Parse RFC 822 dates and times
         3307     http://tools.ietf.org/html/rfc822#section-5
         3308 
         3309     There are some formatting differences that are accounted for:
         3310     1. Years may be two or four digits.
         3311     2. The month and day can be swapped.
         3312     3. Additional timezone names are supported.
         3313     4. A default time and timezone are assumed if only a date is present.
         3314     """
         3315     daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
         3316     months = {
         3317         'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
         3318         'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
         3319     }
         3320 
         3321     parts = date.lower().split()
         3322     if len(parts) < 5:
         3323         # Assume that the time and timezone are missing
         3324         parts.extend(('00:00:00', '0000'))
         3325     # Remove the day name
         3326     if parts[0][:3] in daynames:
         3327         parts = parts[1:]
         3328     if len(parts) < 5:
         3329         # If there are still fewer than five parts, there's not enough
         3330         # information to interpret this
         3331         return None
         3332     try:
         3333         day = int(parts[0])
         3334     except ValueError:
         3335         # Check if the day and month are swapped
         3336         if months.get(parts[0][:3]):
         3337             try:
         3338                 day = int(parts[1])
         3339             except ValueError:
         3340                 return None
         3341             else:
         3342                 parts[1] = parts[0]
         3343         else:
         3344             return None
         3345     month = months.get(parts[1][:3])
         3346     if not month:
         3347         return None
         3348     try:
         3349         year = int(parts[2])
         3350     except ValueError:
         3351         return None
         3352     # Normalize two-digit years:
         3353     # Anything in the 90's is interpreted as 1990 and on
         3354     # Anything 89 or less is interpreted as 2089 or before
         3355     if len(parts[2]) <= 2:
         3356         year += (1900, 2000)[year < 90]
         3357     timeparts = parts[3].split(':')
         3358     timeparts = timeparts + ([0] * (3 - len(timeparts)))
         3359     try:
         3360         (hour, minute, second) = map(int, timeparts)
         3361     except ValueError:
         3362         return None
         3363     tzhour = 0
         3364     tzmin = 0
         3365     # Strip 'Etc/' from the timezone
         3366     if parts[4].startswith('etc/'):
         3367         parts[4] = parts[4][4:]
         3368     # Normalize timezones that start with 'gmt':
         3369     # GMT-05:00 => -0500
         3370     # GMT => GMT
         3371     if parts[4].startswith('gmt'):
         3372         parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
         3373     # Handle timezones like '-0500', '+0500', and 'EST'
         3374     if parts[4] and parts[4][0] in ('-', '+'):
         3375         try:
         3376             tzhour = int(parts[4][1:3])
         3377             tzmin = int(parts[4][3:])
         3378         except ValueError:
         3379             return None
         3380         if parts[4].startswith('-'):
         3381             tzhour = tzhour * -1
         3382             tzmin = tzmin * -1
         3383     else:
         3384         tzhour = timezonenames.get(parts[4], 0)
         3385     # Create the datetime object and timezone delta objects
         3386     try:
         3387         stamp = datetime.datetime(year, month, day, hour, minute, second)
         3388     except ValueError:
         3389         return None
         3390     delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
         3391     # Return the date and timestamp in a UTC 9-tuple
         3392     try:
         3393         return (stamp - delta).utctimetuple()
         3394     except (OverflowError, ValueError):
         3395         # IronPython throws ValueErrors instead of OverflowErrors
         3396         return None
         3397 registerDateHandler(_parse_date_rfc822)
         3398 
         3399 _months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         3400            'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
         3401 def _parse_date_asctime(dt):
         3402     """Parse asctime-style dates.
         3403 
         3404     Converts asctime to RFC822-compatible dates and uses the RFC822 parser
         3405     to do the actual parsing.
         3406 
         3407     Supported formats (format is standardized to the first one listed):
         3408 
         3409     * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
         3410     * {weekday name} {month name} dd hh:mm:ss yyyy
         3411     """
         3412 
         3413     parts = dt.split()
         3414 
         3415     # Insert a GMT timezone, if needed.
         3416     if len(parts) == 5:
         3417         parts.insert(4, '+0000')
         3418 
         3419     # Exit if there are not six parts.
         3420     if len(parts) != 6:
         3421         return None
         3422 
         3423     # Reassemble the parts in an RFC822-compatible order and parse them.
         3424     return _parse_date_rfc822(' '.join([
         3425         parts[0], parts[2], parts[1], parts[5], parts[3], parts[4],
         3426     ]))
         3427 registerDateHandler(_parse_date_asctime)
         3428 
         3429 def _parse_date_perforce(aDateString):
         3430     """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
         3431     # Fri, 2006/09/15 08:19:53 EDT
         3432     _my_date_pattern = re.compile( \
         3433         r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
         3434 
         3435     m = _my_date_pattern.search(aDateString)
         3436     if m is None:
         3437         return None
         3438     dow, year, month, day, hour, minute, second, tz = m.groups()
         3439     months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
         3440     dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
         3441     tm = rfc822.parsedate_tz(dateString)
         3442     if tm:
         3443         return time.gmtime(rfc822.mktime_tz(tm))
         3444 registerDateHandler(_parse_date_perforce)
         3445 
         3446 def _parse_date(dateString):
         3447     '''Parses a variety of date formats into a 9-tuple in GMT'''
         3448     if not dateString:
         3449         return None
         3450     for handler in _date_handlers:
         3451         try:
         3452             date9tuple = handler(dateString)
         3453         except (KeyError, OverflowError, ValueError):
         3454             continue
         3455         if not date9tuple:
         3456             continue
         3457         if len(date9tuple) != 9:
         3458             continue
         3459         return date9tuple
         3460     return None
         3461 
         3462 # Each marker represents some of the characters of the opening XML
         3463 # processing instruction ('<?xm') in the specified encoding.
         3464 EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
         3465 UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
         3466 UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
         3467 UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
         3468 UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
         3469 
         3470 ZERO_BYTES = _l2bytes([0x00, 0x00])
         3471 
         3472 # Match the opening XML declaration.
         3473 # Example: <?xml version="1.0" encoding="utf-8"?>
         3474 RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
         3475 
         3476 # Capture the value of the XML processing instruction's encoding attribute.
         3477 # Example: <?xml version="1.0" encoding="utf-8"?>
         3478 RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
         3479 
         3480 def convert_to_utf8(http_headers, data):
         3481     '''Detect and convert the character encoding to UTF-8.
         3482 
         3483     http_headers is a dictionary
         3484     data is a raw string (not Unicode)'''
         3485 
         3486     # This is so much trickier than it sounds, it's not even funny.
         3487     # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
         3488     # is application/xml, application/*+xml,
         3489     # application/xml-external-parsed-entity, or application/xml-dtd,
         3490     # the encoding given in the charset parameter of the HTTP Content-Type
         3491     # takes precedence over the encoding given in the XML prefix within the
         3492     # document, and defaults to 'utf-8' if neither are specified.  But, if
         3493     # the HTTP Content-Type is text/xml, text/*+xml, or
         3494     # text/xml-external-parsed-entity, the encoding given in the XML prefix
         3495     # within the document is ALWAYS IGNORED and only the encoding given in
         3496     # the charset parameter of the HTTP Content-Type header should be
         3497     # respected, and it defaults to 'us-ascii' if not specified.
         3498 
         3499     # Furthermore, discussion on the atom-syntax mailing list with the
         3500     # author of RFC 3023 leads me to the conclusion that any document
         3501     # served with a Content-Type of text/* and no charset parameter
         3502     # must be treated as us-ascii.  (We now do this.)  And also that it
         3503     # must always be flagged as non-well-formed.  (We now do this too.)
         3504 
         3505     # If Content-Type is unspecified (input was local file or non-HTTP source)
         3506     # or unrecognized (server just got it totally wrong), then go by the
         3507     # encoding given in the XML prefix of the document and default to
         3508     # 'iso-8859-1' as per the HTTP specification (RFC 2616).
         3509 
         3510     # Then, assuming we didn't find a character encoding in the HTTP headers
         3511     # (and the HTTP Content-type allowed us to look in the body), we need
         3512     # to sniff the first few bytes of the XML data and try to determine
         3513     # whether the encoding is ASCII-compatible.  Section F of the XML
         3514     # specification shows the way here:
         3515     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
         3516 
         3517     # If the sniffed encoding is not ASCII-compatible, we need to make it
         3518     # ASCII compatible so that we can sniff further into the XML declaration
         3519     # to find the encoding attribute, which will tell us the true encoding.
         3520 
         3521     # Of course, none of this guarantees that we will be able to parse the
         3522     # feed in the declared character encoding (assuming it was declared
         3523     # correctly, which many are not).  iconv_codec can help a lot;
         3524     # you should definitely install it if you can.
         3525     # http://cjkpython.i18n.org/
         3526 
         3527     bom_encoding = u''
         3528     xml_encoding = u''
         3529     rfc3023_encoding = u''
         3530 
         3531     # Look at the first few bytes of the document to guess what
         3532     # its encoding may be. We only need to decode enough of the
         3533     # document that we can use an ASCII-compatible regular
         3534     # expression to search for an XML encoding declaration.
         3535     # The heuristic follows the XML specification, section F:
         3536     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
         3537     # Check for BOMs first.
         3538     if data[:4] == codecs.BOM_UTF32_BE:
         3539         bom_encoding = u'utf-32be'
         3540         data = data[4:]
         3541     elif data[:4] == codecs.BOM_UTF32_LE:
         3542         bom_encoding = u'utf-32le'
         3543         data = data[4:]
         3544     elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
         3545         bom_encoding = u'utf-16be'
         3546         data = data[2:]
         3547     elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
         3548         bom_encoding = u'utf-16le'
         3549         data = data[2:]
         3550     elif data[:3] == codecs.BOM_UTF8:
         3551         bom_encoding = u'utf-8'
         3552         data = data[3:]
         3553     # Check for the characters '<?xm' in several encodings.
         3554     elif data[:4] == EBCDIC_MARKER:
         3555         bom_encoding = u'cp037'
         3556     elif data[:4] == UTF16BE_MARKER:
         3557         bom_encoding = u'utf-16be'
         3558     elif data[:4] == UTF16LE_MARKER:
         3559         bom_encoding = u'utf-16le'
         3560     elif data[:4] == UTF32BE_MARKER:
         3561         bom_encoding = u'utf-32be'
         3562     elif data[:4] == UTF32LE_MARKER:
         3563         bom_encoding = u'utf-32le'
         3564 
         3565     tempdata = data
         3566     try:
         3567         if bom_encoding:
         3568             tempdata = data.decode(bom_encoding).encode('utf-8')
         3569     except (UnicodeDecodeError, LookupError):
         3570         # feedparser recognizes UTF-32 encodings that aren't
         3571         # available in Python 2.4 and 2.5, so it's possible to
         3572         # encounter a LookupError during decoding.
         3573         xml_encoding_match = None
         3574     else:
         3575         xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
         3576 
         3577     if xml_encoding_match:
         3578         xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
         3579         # Normalize the xml_encoding if necessary.
         3580         if bom_encoding and (xml_encoding in (
         3581             u'u16', u'utf-16', u'utf16', u'utf_16',
         3582             u'u32', u'utf-32', u'utf32', u'utf_32',
         3583             u'iso-10646-ucs-2', u'iso-10646-ucs-4',
         3584             u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
         3585         )):
         3586             xml_encoding = bom_encoding
         3587 
         3588     # Find the HTTP Content-Type and, hopefully, a character
         3589     # encoding provided by the server. The Content-Type is used
         3590     # to choose the "correct" encoding among the BOM encoding,
         3591     # XML declaration encoding, and HTTP encoding, following the
         3592     # heuristic defined in RFC 3023.
         3593     http_content_type = http_headers.get('content-type') or ''
         3594     http_content_type, params = cgi.parse_header(http_content_type)
         3595     http_encoding = params.get('charset', '').replace("'", "")
         3596     if not isinstance(http_encoding, unicode):
         3597         http_encoding = http_encoding.decode('utf-8', 'ignore')
         3598 
         3599     acceptable_content_type = 0
         3600     application_content_types = (u'application/xml', u'application/xml-dtd',
         3601                                  u'application/xml-external-parsed-entity')
         3602     text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
         3603     if (http_content_type in application_content_types) or \
         3604        (http_content_type.startswith(u'application/') and
         3605         http_content_type.endswith(u'+xml')):
         3606         acceptable_content_type = 1
         3607         rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
         3608     elif (http_content_type in text_content_types) or \
         3609          (http_content_type.startswith(u'text/') and
         3610           http_content_type.endswith(u'+xml')):
         3611         acceptable_content_type = 1
         3612         rfc3023_encoding = http_encoding or u'us-ascii'
         3613     elif http_content_type.startswith(u'text/'):
         3614         rfc3023_encoding = http_encoding or u'us-ascii'
         3615     elif http_headers and 'content-type' not in http_headers:
         3616         rfc3023_encoding = xml_encoding or u'iso-8859-1'
         3617     else:
         3618         rfc3023_encoding = xml_encoding or u'utf-8'
         3619     # gb18030 is a superset of gb2312, so always replace gb2312
         3620     # with gb18030 for greater compatibility.
         3621     if rfc3023_encoding.lower() == u'gb2312':
         3622         rfc3023_encoding = u'gb18030'
         3623     if xml_encoding.lower() == u'gb2312':
         3624         xml_encoding = u'gb18030'
         3625 
         3626     # there are four encodings to keep track of:
         3627     # - http_encoding is the encoding declared in the Content-Type HTTP header
         3628     # - xml_encoding is the encoding declared in the <?xml declaration
         3629     # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
         3630     # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
         3631     error = None
         3632 
         3633     if http_headers and (not acceptable_content_type):
         3634         if 'content-type' in http_headers:
         3635             msg = '%s is not an XML media type' % http_headers['content-type']
         3636         else:
         3637             msg = 'no Content-type specified'
         3638         error = NonXMLContentType(msg)
         3639 
         3640     # determine character encoding
         3641     known_encoding = 0
         3642     lazy_chardet_encoding = None
         3643     tried_encodings = []
         3644     if chardet:
         3645         def lazy_chardet_encoding():
         3646             chardet_encoding = chardet.detect(data)['encoding']
         3647             if not chardet_encoding:
         3648                 chardet_encoding = ''
         3649             if not isinstance(chardet_encoding, unicode):
         3650                 chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore')
         3651             return chardet_encoding
         3652     # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
         3653     for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
         3654                               lazy_chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
         3655         if callable(proposed_encoding):
         3656             proposed_encoding = proposed_encoding()
         3657         if not proposed_encoding:
         3658             continue
         3659         if proposed_encoding in tried_encodings:
         3660             continue
         3661         tried_encodings.append(proposed_encoding)
         3662         try:
         3663             data = data.decode(proposed_encoding)
         3664         except (UnicodeDecodeError, LookupError):
         3665             pass
         3666         else:
         3667             known_encoding = 1
         3668             # Update the encoding in the opening XML processing instruction.
         3669             new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
         3670             if RE_XML_DECLARATION.search(data):
         3671                 data = RE_XML_DECLARATION.sub(new_declaration, data)
         3672             else:
         3673                 data = new_declaration + u'\n' + data
         3674             data = data.encode('utf-8')
         3675             break
         3676     # if still no luck, give up
         3677     if not known_encoding:
         3678         error = CharacterEncodingUnknown(
         3679             'document encoding unknown, I tried ' +
         3680             '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
         3681             (rfc3023_encoding, xml_encoding))
         3682         rfc3023_encoding = u''
         3683     elif proposed_encoding != rfc3023_encoding:
         3684         error = CharacterEncodingOverride(
         3685             'document declared as %s, but parsed as %s' %
         3686             (rfc3023_encoding, proposed_encoding))
         3687         rfc3023_encoding = proposed_encoding
         3688 
         3689     return data, rfc3023_encoding, error
         3690 
         3691 # Match XML entity declarations.
         3692 # Example: <!ENTITY copyright "(C)">
         3693 RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
         3694 
         3695 # Match XML DOCTYPE declarations.
         3696 # Example: <!DOCTYPE feed [ ]>
         3697 RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
         3698 
         3699 # Match safe entity declarations.
         3700 # This will allow hexadecimal character references through,
         3701 # as well as text, but not arbitrary nested entities.
         3702 # Example: cubed "&#179;"
         3703 # Example: copyright "(C)"
         3704 # Forbidden: explode1 "&explode2;&explode2;"
         3705 RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
         3706 
         3707 def replace_doctype(data):
         3708     '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
         3709 
         3710     rss_version may be 'rss091n' or None
         3711     stripped_data is the same XML document with a replaced DOCTYPE
         3712     '''
         3713 
         3714     # Divide the document into two groups by finding the location
         3715     # of the first element that doesn't begin with '<?' or '<!'.
         3716     start = re.search(_s2bytes('<\w'), data)
         3717     start = start and start.start() or -1
         3718     head, data = data[:start+1], data[start+1:]
         3719 
         3720     # Save and then remove all of the ENTITY declarations.
         3721     entity_results = RE_ENTITY_PATTERN.findall(head)
         3722     head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
         3723 
         3724     # Find the DOCTYPE declaration and check the feed type.
         3725     doctype_results = RE_DOCTYPE_PATTERN.findall(head)
         3726     doctype = doctype_results and doctype_results[0] or _s2bytes('')
         3727     if _s2bytes('netscape') in doctype.lower():
         3728         version = u'rss091n'
         3729     else:
         3730         version = None
         3731 
         3732     # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
         3733     replacement = _s2bytes('')
         3734     if len(doctype_results) == 1 and entity_results:
         3735         match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
         3736         safe_entities = filter(match_safe_entities, entity_results)
         3737         if safe_entities:
         3738             replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
         3739                         + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
         3740                         + _s2bytes('>\n]>')
         3741     data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
         3742 
         3743     # Precompute the safe entities for the loose parser.
         3744     safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
         3745                       for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
         3746     return version, data, safe_entities
         3747 
         3748 
         3749 # GeoRSS geometry parsers. Each return a dict with 'type' and 'coordinates'
         3750 # items, or None in the case of a parsing error.
         3751 
         3752 def _parse_poslist(value, geom_type, swap=True, dims=2):
         3753     if geom_type == 'linestring':
         3754         return _parse_georss_line(value, swap, dims)
         3755     elif geom_type == 'polygon':
         3756         ring = _parse_georss_line(value, swap, dims)
         3757         return {'type': u'Polygon', 'coordinates': (ring['coordinates'],)}
         3758     else:
         3759         return None
         3760 
         3761 def _gen_georss_coords(value, swap=True, dims=2):
         3762     # A generator of (lon, lat) pairs from a string of encoded GeoRSS
         3763     # coordinates. Converts to floats and swaps order.
         3764     latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
         3765     nxt = latlons.next
         3766     while True:
         3767         t = [nxt(), nxt()][::swap and -1 or 1]
         3768         if dims == 3:
         3769             t.append(nxt())
         3770         yield tuple(t)
         3771 
         3772 def _parse_georss_point(value, swap=True, dims=2):
         3773     # A point contains a single latitude-longitude pair, separated by
         3774     # whitespace. We'll also handle comma separators.
         3775     try:
         3776         coords = list(_gen_georss_coords(value, swap, dims))
         3777         return {u'type': u'Point', u'coordinates': coords[0]}
         3778     except (IndexError, ValueError):
         3779         return None
         3780 
         3781 def _parse_georss_line(value, swap=True, dims=2):
         3782     # A line contains a space separated list of latitude-longitude pairs in
         3783     # WGS84 coordinate reference system, with each pair separated by
         3784     # whitespace. There must be at least two pairs.
         3785     try:
         3786         coords = list(_gen_georss_coords(value, swap, dims))
         3787         return {u'type': u'LineString', u'coordinates': coords}
         3788     except (IndexError, ValueError):
         3789         return None
         3790 
         3791 def _parse_georss_polygon(value, swap=True, dims=2):
         3792     # A polygon contains a space separated list of latitude-longitude pairs,
         3793     # with each pair separated by whitespace. There must be at least four
         3794     # pairs, with the last being identical to the first (so a polygon has a
         3795     # minimum of three actual points).
         3796     try:
         3797         ring = list(_gen_georss_coords(value, swap, dims))
         3798     except (IndexError, ValueError):
         3799         return None
         3800     if len(ring) < 4:
         3801         return None
         3802     return {u'type': u'Polygon', u'coordinates': (ring,)}
         3803 
         3804 def _parse_georss_box(value, swap=True, dims=2):
         3805     # A bounding box is a rectangular region, often used to define the extents
         3806     # of a map or a rough area of interest. A box contains two space seperate
         3807     # latitude-longitude pairs, with each pair separated by whitespace. The
         3808     # first pair is the lower corner, the second is the upper corner.
         3809     try:
         3810         coords = list(_gen_georss_coords(value, swap, dims))
         3811         return {u'type': u'Box', u'coordinates': tuple(coords)}
         3812     except (IndexError, ValueError):
         3813         return None
         3814 
         3815 # end geospatial parsers
         3816 
         3817 
         3818 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
         3819     '''Parse a feed from a URL, file, stream, or string.
         3820 
         3821     request_headers, if given, is a dict from http header name to value to add
         3822     to the request; this overrides internally generated values.
         3823 
         3824     :return: A :class:`FeedParserDict`.
         3825     '''
         3826 
         3827     if handlers is None:
         3828         handlers = []
         3829     if request_headers is None:
         3830         request_headers = {}
         3831     if response_headers is None:
         3832         response_headers = {}
         3833 
         3834     result = FeedParserDict()
         3835     result['feed'] = FeedParserDict()
         3836     result['entries'] = []
         3837     result['bozo'] = 0
         3838     if not isinstance(handlers, list):
         3839         handlers = [handlers]
         3840     try:
         3841         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
         3842         data = f.read()
         3843     except Exception, e:
         3844         result['bozo'] = 1
         3845         result['bozo_exception'] = e
         3846         data = None
         3847         f = None
         3848 
         3849     if hasattr(f, 'headers'):
         3850         result['headers'] = dict(f.headers)
         3851     # overwrite existing headers using response_headers
         3852     if 'headers' in result:
         3853         result['headers'].update(response_headers)
         3854     elif response_headers:
         3855         result['headers'] = copy.deepcopy(response_headers)
         3856 
         3857     # lowercase all of the HTTP headers for comparisons per RFC 2616
         3858     if 'headers' in result:
         3859         http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
         3860     else:
         3861         http_headers = {}
         3862 
         3863     # if feed is gzip-compressed, decompress it
         3864     if f and data and http_headers:
         3865         if gzip and 'gzip' in http_headers.get('content-encoding', ''):
         3866             try:
         3867                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
         3868             except (IOError, struct.error), e:
         3869                 # IOError can occur if the gzip header is bad.
         3870                 # struct.error can occur if the data is damaged.
         3871                 result['bozo'] = 1
         3872                 result['bozo_exception'] = e
         3873                 if isinstance(e, struct.error):
         3874                     # A gzip header was found but the data is corrupt.
         3875                     # Ideally, we should re-request the feed without the
         3876                     # 'Accept-encoding: gzip' header, but we don't.
         3877                     data = None
         3878         elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
         3879             try:
         3880                 data = zlib.decompress(data)
         3881             except zlib.error, e:
         3882                 try:
         3883                     # The data may have no headers and no checksum.
         3884                     data = zlib.decompress(data, -15)
         3885                 except zlib.error, e:
         3886                     result['bozo'] = 1
         3887                     result['bozo_exception'] = e
         3888 
         3889     # save HTTP headers
         3890     if http_headers:
         3891         if 'etag' in http_headers:
         3892             etag = http_headers.get('etag', u'')
         3893             if not isinstance(etag, unicode):
         3894                 etag = etag.decode('utf-8', 'ignore')
         3895             if etag:
         3896                 result['etag'] = etag
         3897         if 'last-modified' in http_headers:
         3898             modified = http_headers.get('last-modified', u'')
         3899             if modified:
         3900                 result['modified'] = modified
         3901                 result['modified_parsed'] = _parse_date(modified)
         3902     if hasattr(f, 'url'):
         3903         if not isinstance(f.url, unicode):
         3904             result['href'] = f.url.decode('utf-8', 'ignore')
         3905         else:
         3906             result['href'] = f.url
         3907         result['status'] = 200
         3908     if hasattr(f, 'status'):
         3909         result['status'] = f.status
         3910     if hasattr(f, 'close'):
         3911         f.close()
         3912 
         3913     if data is None:
         3914         return result
         3915 
         3916     # Stop processing if the server sent HTTP 304 Not Modified.
         3917     if getattr(f, 'code', 0) == 304:
         3918         result['version'] = u''
         3919         result['debug_message'] = 'The feed has not changed since you last checked, ' + \
         3920             'so the server sent no data.  This is a feature, not a bug!'
         3921         return result
         3922 
         3923     data, result['encoding'], error = convert_to_utf8(http_headers, data)
         3924     use_strict_parser = result['encoding'] and True or False
         3925     if error is not None:
         3926         result['bozo'] = 1
         3927         result['bozo_exception'] = error
         3928 
         3929     result['version'], data, entities = replace_doctype(data)
         3930 
         3931     # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
         3932     contentloc = http_headers.get('content-location', u'')
         3933     href = result.get('href', u'')
         3934     baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
         3935 
         3936     baselang = http_headers.get('content-language', None)
         3937     if not isinstance(baselang, unicode) and baselang is not None:
         3938         baselang = baselang.decode('utf-8', 'ignore')
         3939 
         3940     if not _XML_AVAILABLE:
         3941         use_strict_parser = 0
         3942     if use_strict_parser:
         3943         # initialize the SAX parser
         3944         feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
         3945         saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
         3946         saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
         3947         try:
         3948             # disable downloading external doctype references, if possible
         3949             saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
         3950         except xml.sax.SAXNotSupportedException:
         3951             pass
         3952         saxparser.setContentHandler(feedparser)
         3953         saxparser.setErrorHandler(feedparser)
         3954         source = xml.sax.xmlreader.InputSource()
         3955         source.setByteStream(_StringIO(data))
         3956         try:
         3957             saxparser.parse(source)
         3958         except xml.sax.SAXException, e:
         3959             result['bozo'] = 1
         3960             result['bozo_exception'] = feedparser.exc or e
         3961             use_strict_parser = 0
         3962     if not use_strict_parser and _SGML_AVAILABLE:
         3963         feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
         3964         feedparser.feed(data.decode('utf-8', 'replace'))
         3965     result['feed'] = feedparser.feeddata
         3966     result['entries'] = feedparser.entries
         3967     result['version'] = result['version'] or feedparser.version
         3968     result['namespaces'] = feedparser.namespacesInUse
         3969     return result
         3970 
         3971 # The list of EPSG codes for geographic (latitude/longitude) coordinate
         3972 # systems to support decoding of GeoRSS GML profiles.
         3973 _geogCS = [
         3974 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
         3975 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
         3976 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
         3977 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
         3978 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
         3979 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
         3980 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
         3981 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
         3982 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
         3983 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
         3984 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
         3985 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
         3986 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
         3987 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
         3988 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
         3989 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
         3990 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
         3991 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
         3992 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
         3993 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
         3994 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
         3995 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
         3996 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
         3997 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
         3998 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
         3999 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
         4000 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
         4001 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
         4002 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
         4003 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
         4004 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
         4005 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
         4006 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
         4007 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]
         4008