feedparser.py - gamingskill - A Linux gaming news skill for Amazon Alexa, so I could get monthly AWS credits.
git clone git://jay.scot/gamingskill
---
feedparser.py (160057B)
---
1 """Universal feed parser
2
3 Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
4
5 Visit https://code.google.com/p/feedparser/ for the latest version
6 Visit http://packages.python.org/feedparser/ for the latest documentation
7
8 Required: Python 2.4 or later
9 Recommended: iconv_codec <http://cjkpython.i18n.org/>
10 """
11
12 __version__ = "5.2.1"
13 __license__ = """
14 Copyright 2010-2015 Kurt McKee <contactme@kurtmckee.org>
15 Copyright 2002-2008 Mark Pilgrim
16 All rights reserved.
17
18 Redistribution and use in source and binary forms, with or without modification,
19 are permitted provided that the following conditions are met:
20
21 * Redistributions of source code must retain the above copyright notice,
22 this list of conditions and the following disclaimer.
23 * Redistributions in binary form must reproduce the above copyright notice,
24 this list of conditions and the following disclaimer in the documentation
25 and/or other materials provided with the distribution.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE."""
38 __author__ = "Mark Pilgrim <http://diveintomark.org/>"
39 __contributors__ = ["Jason Diamond <http://injektilo.org/>",
40 "John Beimler <http://john.beimler.org/>",
41 "Fazal Majid <http://www.majid.info/mylos/weblog/>",
42 "Aaron Swartz <http://aaronsw.com/>",
43 "Kevin Marks <http://epeus.blogspot.com/>",
44 "Sam Ruby <http://intertwingly.net/>",
45 "Ade Oshineye <http://blog.oshineye.com/>",
46 "Martin Pool <http://sourcefrog.net/>",
47 "Kurt McKee <http://kurtmckee.org/>",
48 "Bernd Schlapsi <https://github.com/brot>",]
49
50 # HTTP "User-Agent" header to send to servers when downloading feeds.
51 # If you are embedding feedparser in a larger application, you should
52 # change this to your application name and URL.
53 USER_AGENT = "UniversalFeedParser/%s +https://code.google.com/p/feedparser/" % __version__
54
55 # HTTP "Accept" header to send to servers when downloading feeds. If you don't
56 # want to send an Accept header, set this to None.
57 ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
58
59 # List of preferred XML parsers, by SAX driver name. These will be tried first,
60 # but if they're not installed, Python will keep searching through its own list
61 # of pre-installed parsers until it finds one that supports everything we need.
62 PREFERRED_XML_PARSERS = ["drv_libxml2"]
63
64 # If you want feedparser to automatically resolve all relative URIs, set this
65 # to 1.
66 RESOLVE_RELATIVE_URIS = 1
67
68 # If you want feedparser to automatically sanitize all potentially unsafe
69 # HTML content, set this to 1.
70 SANITIZE_HTML = 1
71
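# A minimal sketch of how an embedding application might override the
# settings above before fetching anything. Illustrative only: the skill
# name, URL, and feed address are placeholders, and this assumes the
# module is importable as `feedparser`.
def _example_configure_and_parse():
    import feedparser
    feedparser.USER_AGENT = 'GamingSkill/1.0 +http://example.com/'
    feedparser.RESOLVE_RELATIVE_URIS = 1  # resolve relative links (the default)
    feedparser.SANITIZE_HTML = 1          # sanitize embedded HTML (the default)
    return feedparser.parse('http://example.com/feed.xml')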
72 # ---------- Python 3 modules (make it work if possible) ----------
73 try:
74 import rfc822
75 except ImportError:
76 from email import _parseaddr as rfc822
77
78 try:
79 # Python 3.1 introduces bytes.maketrans and simultaneously
80 # deprecates string.maketrans; use bytes.maketrans if possible
81 _maketrans = bytes.maketrans
82 except (NameError, AttributeError):
83 import string
84 _maketrans = string.maketrans
85
86 # base64 support for Atom feeds that contain embedded binary data
87 try:
88 import base64, binascii
89 except ImportError:
90 base64 = binascii = None
91 else:
92 # Python 3.1 deprecates decodestring in favor of decodebytes
93 _base64decode = getattr(base64, 'decodebytes', base64.decodestring)
94
95 # _s2bytes: convert a UTF-8 str to bytes if the interpreter is Python 3
96 # _l2bytes: convert a list of ints to bytes if the interpreter is Python 3
97 try:
98 if bytes is str:
99 # In Python 2.5 and below, bytes doesn't exist (NameError)
100 # In Python 2.6 and above, bytes and str are the same type
101 raise NameError
102 except NameError:
103 # Python 2
104 def _s2bytes(s):
105 return s
106 def _l2bytes(l):
107 return ''.join(map(chr, l))
108 else:
109 # Python 3
110 def _s2bytes(s):
111 return bytes(s, 'utf8')
112 def _l2bytes(l):
113 return bytes(l)
114
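# A tiny illustration of the two helpers above (the function name is
# illustrative, not part of the module): on Python 2 they return plain
# str objects, on Python 3 real bytes. Elsewhere in this module they are
# used for byte-level comparisons such as byte-order-mark detection.
def _example_byte_helpers():
    bom_utf8 = _l2bytes([0xEF, 0xBB, 0xBF])  # UTF-8 byte-order mark
    prologue = _s2bytes('<?xml')             # XML declaration prefix
    return bom_utf8, prologue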
115 # If you want feedparser to allow all URL schemes, set this to ()
116 # List culled from Python's urlparse documentation at:
117 # http://docs.python.org/library/urlparse.html
118 # as well as from "URI scheme" at Wikipedia:
119 # https://secure.wikimedia.org/wikipedia/en/wiki/URI_scheme
120 # Many more will likely need to be added!
121 ACCEPTABLE_URI_SCHEMES = (
122 'file', 'ftp', 'gopher', 'h323', 'hdl', 'http', 'https', 'imap', 'magnet',
123 'mailto', 'mms', 'news', 'nntp', 'prospero', 'rsync', 'rtsp', 'rtspu',
124 'sftp', 'shttp', 'sip', 'sips', 'snews', 'svn', 'svn+ssh', 'telnet',
125 'wais',
126 # Additional common-but-unofficial schemes
127 'aim', 'callto', 'cvs', 'facetime', 'feed', 'git', 'gtalk', 'irc', 'ircs',
128 'irc6', 'itms', 'msnim', 'skype', 'ssh', 'smb', 'ymsg',
129 )
130 #ACCEPTABLE_URI_SCHEMES = ()
131
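# Two illustrative ways an application might adjust the whitelist at
# runtime (assumes the module is importable as `feedparser`; the extra
# scheme below is an example, not part of the defaults):
def _example_adjust_uri_schemes():
    import feedparser
    default = feedparser.ACCEPTABLE_URI_SCHEMES
    # extend the whitelist with one additional (example) scheme...
    feedparser.ACCEPTABLE_URI_SCHEMES = default + ('steam',)
    # ...or disable the whitelist entirely, allowing every scheme through
    feedparser.ACCEPTABLE_URI_SCHEMES = ()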
132 # ---------- required modules (should come with any Python distribution) ----------
133 import cgi
134 import codecs
135 import copy
136 import datetime
137 import itertools
138 import re
139 import struct
140 import time
141 import types
142 import urllib
143 import urllib2
144 import urlparse
145 import warnings
146
147 from htmlentitydefs import name2codepoint, codepoint2name, entitydefs
148
149 try:
150 from io import BytesIO as _StringIO
151 except ImportError:
152 try:
153 from cStringIO import StringIO as _StringIO
154 except ImportError:
155 from StringIO import StringIO as _StringIO
156
157 # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
158
159 # gzip is included with most Python distributions, but may not be available if you compiled your own Python
160 try:
161 import gzip
162 except ImportError:
163 gzip = None
164 try:
165 import zlib
166 except ImportError:
167 zlib = None
168
169 # If a real XML parser is available, feedparser will attempt to use it. feedparser has
170 # been tested with the built-in SAX parser and libxml2. On platforms where the
171 # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
172 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
173 try:
174 import xml.sax
175 from xml.sax.saxutils import escape as _xmlescape
176 except ImportError:
177 _XML_AVAILABLE = 0
178 def _xmlescape(data,entities={}):
179 data = data.replace('&', '&amp;')
180 data = data.replace('>', '&gt;')
181 data = data.replace('<', '&lt;')
182 for char, entity in entities.items():
183 data = data.replace(char, entity)
184 return data
185 else:
186 try:
187 xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
188 except xml.sax.SAXReaderNotAvailable:
189 _XML_AVAILABLE = 0
190 else:
191 _XML_AVAILABLE = 1
192
193 # sgmllib is not available by default in Python 3; if the end user doesn't have
194 # it available then we'll lose illformed XML parsing and content sanitizing
195 try:
196 import sgmllib
197 except ImportError:
198 # This is probably Python 3, which doesn't include sgmllib anymore
199 _SGML_AVAILABLE = 0
200
201 # Mock sgmllib enough to allow subclassing later on
202 class sgmllib(object):
203 class SGMLParser(object):
204 def goahead(self, i):
205 pass
206 def parse_starttag(self, i):
207 pass
208 else:
209 _SGML_AVAILABLE = 1
210
211 # sgmllib defines a number of module-level regular expressions that are
212 # insufficient for the XML parsing feedparser needs. Rather than modify
213 # the variables directly in sgmllib, they're defined here using the same
214 # names, and the compiled code objects of several sgmllib.SGMLParser
215 # methods are copied into _BaseHTMLProcessor so that they execute in
216 # feedparser's scope instead of sgmllib's scope.
217 charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+);')
218 tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
219 attrfind = re.compile(
220 r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)[$]?(\s*=\s*'
221 r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?'
222 )
223
224 # Unfortunately, these must be copied over to prevent NameError exceptions
225 entityref = sgmllib.entityref
226 incomplete = sgmllib.incomplete
227 interesting = sgmllib.interesting
228 shorttag = sgmllib.shorttag
229 shorttagopen = sgmllib.shorttagopen
230 starttagopen = sgmllib.starttagopen
231
232 class _EndBracketRegEx:
233 def __init__(self):
234 # Overriding the built-in sgmllib.endbracket regex allows the
235 # parser to find angle brackets embedded in element attributes.
236 self.endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
237 def search(self, target, index=0):
238 match = self.endbracket.match(target, index)
239 if match is not None:
240 # Returning a new object in the calling thread's context
241 # resolves a thread-safety issue.
242 return EndBracketMatch(match)
243 return None
244 class EndBracketMatch:
245 def __init__(self, match):
246 self.match = match
247 def start(self, n):
248 return self.match.end(n)
249 endbracket = _EndBracketRegEx()
250
251
252 # iconv_codec provides support for more character encodings.
253 # It's available from http://cjkpython.i18n.org/
254 try:
255 import iconv_codec
256 except ImportError:
257 pass
258
259 # chardet library auto-detects character encodings
260 # Download from http://chardet.feedparser.org/
261 try:
262 import chardet
263 except ImportError:
264 chardet = None
265
266 # ---------- don't touch these ----------
267 class ThingsNobodyCaresAboutButMe(Exception): pass
268 class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
269 class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
270 class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
271 class UndeclaredNamespace(Exception): pass
272
273 SUPPORTED_VERSIONS = {'': u'unknown',
274 'rss090': u'RSS 0.90',
275 'rss091n': u'RSS 0.91 (Netscape)',
276 'rss091u': u'RSS 0.91 (Userland)',
277 'rss092': u'RSS 0.92',
278 'rss093': u'RSS 0.93',
279 'rss094': u'RSS 0.94',
280 'rss20': u'RSS 2.0',
281 'rss10': u'RSS 1.0',
282 'rss': u'RSS (unknown version)',
283 'atom01': u'Atom 0.1',
284 'atom02': u'Atom 0.2',
285 'atom03': u'Atom 0.3',
286 'atom10': u'Atom 1.0',
287 'atom': u'Atom (unknown version)',
288 'cdf': u'CDF',
289 }
290
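# After a parse, the result's `version` attribute holds one of the keys
# above; this mapping gives a human-readable label. A small sketch (the
# `result` argument stands in for the object returned by parse()):
def _example_version_label(result):
    # e.g. result.version == 'rss20' -> u'RSS 2.0'
    return SUPPORTED_VERSIONS.get(result.version, SUPPORTED_VERSIONS[''])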
291 class FeedParserDict(dict):
292 keymap = {'channel': 'feed',
293 'items': 'entries',
294 'guid': 'id',
295 'date': 'updated',
296 'date_parsed': 'updated_parsed',
297 'description': ['summary', 'subtitle'],
298 'description_detail': ['summary_detail', 'subtitle_detail'],
299 'url': ['href'],
300 'modified': 'updated',
301 'modified_parsed': 'updated_parsed',
302 'issued': 'published',
303 'issued_parsed': 'published_parsed',
304 'copyright': 'rights',
305 'copyright_detail': 'rights_detail',
306 'tagline': 'subtitle',
307 'tagline_detail': 'subtitle_detail'}
308 def __getitem__(self, key):
309 '''
310 :return: A :class:`FeedParserDict`.
311 '''
312 if key == 'category':
313 try:
314 return dict.__getitem__(self, 'tags')[0]['term']
315 except IndexError:
316 raise KeyError, "object doesn't have key 'category'"
317 elif key == 'enclosures':
318 norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
319 return [norel(link) for link in dict.__getitem__(self, 'links') if link['rel']==u'enclosure']
320 elif key == 'license':
321 for link in dict.__getitem__(self, 'links'):
322 if link['rel']==u'license' and 'href' in link:
323 return link['href']
324 elif key == 'updated':
325 # Temporarily help developers out by keeping the old
326 # broken behavior that was reported in issue 310.
327 # This fix was proposed in issue 328.
328 if not dict.__contains__(self, 'updated') and \
329 dict.__contains__(self, 'published'):
330 warnings.warn("To avoid breaking existing software while "
331 "fixing issue 310, a temporary mapping has been created "
332 "from `updated` to `published` if `updated` doesn't "
333 "exist. This fallback will be removed in a future version "
334 "of feedparser.", DeprecationWarning)
335 return dict.__getitem__(self, 'published')
336 return dict.__getitem__(self, 'updated')
337 elif key == 'updated_parsed':
338 if not dict.__contains__(self, 'updated_parsed') and \
339 dict.__contains__(self, 'published_parsed'):
340 warnings.warn("To avoid breaking existing software while "
341 "fixing issue 310, a temporary mapping has been created "
342 "from `updated_parsed` to `published_parsed` if "
343 "`updated_parsed` doesn't exist. This fallback will be "
344 "removed in a future version of feedparser.",
345 DeprecationWarning)
346 return dict.__getitem__(self, 'published_parsed')
347 return dict.__getitem__(self, 'updated_parsed')
348 else:
349 realkey = self.keymap.get(key, key)
350 if isinstance(realkey, list):
351 for k in realkey:
352 if dict.__contains__(self, k):
353 return dict.__getitem__(self, k)
354 elif dict.__contains__(self, realkey):
355 return dict.__getitem__(self, realkey)
356 return dict.__getitem__(self, key)
357
358 def __contains__(self, key):
359 if key in ('updated', 'updated_parsed'):
360 # Temporarily help developers out by keeping the old
361 # broken behavior that was reported in issue 310.
362 # This fix was proposed in issue 328.
363 return dict.__contains__(self, key)
364 try:
365 self.__getitem__(key)
366 except KeyError:
367 return False
368 else:
369 return True
370
371 has_key = __contains__
372
373 def get(self, key, default=None):
374 '''
375 :return: A :class:`FeedParserDict`.
376 '''
377 try:
378 return self.__getitem__(key)
379 except KeyError:
380 return default
381
382 def __setitem__(self, key, value):
383 key = self.keymap.get(key, key)
384 if isinstance(key, list):
385 key = key[0]
386 return dict.__setitem__(self, key, value)
387
388 def setdefault(self, key, value):
389 if key not in self:
390 self[key] = value
391 return value
392 return self[key]
393
394 def __getattr__(self, key):
395 # __getattribute__() is called first; this will be called
396 # only if an attribute was not already found
397 try:
398 return self.__getitem__(key)
399 except KeyError:
400 raise AttributeError, "object has no attribute '%s'" % key
401
402 def __hash__(self):
403 return id(self)
404
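# A short sketch of the aliasing above: legacy keys are transparently
# mapped to their modern names by __setitem__/__getitem__, and every key
# is also reachable as an attribute via __getattr__ (illustrative values):
def _example_feedparserdict_aliases():
    d = FeedParserDict()
    d['channel'] = FeedParserDict({'title': u'Linux Gaming News'})
    assert d['feed'] is d['channel']             # stored once, under 'feed'
    assert d.feed.title == u'Linux Gaming News'  # attribute-style access
    return d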
405 _cp1252 = {
406 128: unichr(8364), # euro sign
407 130: unichr(8218), # single low-9 quotation mark
408 131: unichr( 402), # latin small letter f with hook
409 132: unichr(8222), # double low-9 quotation mark
410 133: unichr(8230), # horizontal ellipsis
411 134: unichr(8224), # dagger
412 135: unichr(8225), # double dagger
413 136: unichr( 710), # modifier letter circumflex accent
414 137: unichr(8240), # per mille sign
415 138: unichr( 352), # latin capital letter s with caron
416 139: unichr(8249), # single left-pointing angle quotation mark
417 140: unichr( 338), # latin capital ligature oe
418 142: unichr( 381), # latin capital letter z with caron
419 145: unichr(8216), # left single quotation mark
420 146: unichr(8217), # right single quotation mark
421 147: unichr(8220), # left double quotation mark
422 148: unichr(8221), # right double quotation mark
423 149: unichr(8226), # bullet
424 150: unichr(8211), # en dash
425 151: unichr(8212), # em dash
426 152: unichr( 732), # small tilde
427 153: unichr(8482), # trade mark sign
428 154: unichr( 353), # latin small letter s with caron
429 155: unichr(8250), # single right-pointing angle quotation mark
430 156: unichr( 339), # latin small ligature oe
431 158: unichr( 382), # latin small letter z with caron
432 159: unichr( 376), # latin capital letter y with diaeresis
433 }
434
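# The table above feeds unicode.translate(), which accepts exactly this
# kind of {ordinal: replacement} mapping. A one-line repair sketch for
# smart quotes that leaked through as cp1252 code points:
def _example_cp1252_repair():
    return u'\x93quoted\x94'.translate(_cp1252)  # -> u'\u201cquoted\u201d'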
435 _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
436 def _urljoin(base, uri):
437 uri = _urifixer.sub(r'\1\3', uri)
438 if not isinstance(uri, unicode):
439 uri = uri.decode('utf-8', 'ignore')
440 try:
441 uri = urlparse.urljoin(base, uri)
442 except ValueError:
443 uri = u''
444 if not isinstance(uri, unicode):
445 return uri.decode('utf-8', 'ignore')
446 return uri
447
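# A few illustrative calls for the function above (note that _urifixer
# first collapses any stray extra slashes immediately after the scheme):
def _example_urljoin():
    assert _urljoin(u'http://example.com/feed/', u'item/1') == u'http://example.com/feed/item/1'
    assert _urljoin(u'http://example.com/feed/', u'/item/1') == u'http://example.com/item/1'
    assert _urljoin(u'http://example.com/', u'http:////bad//path') == u'http://bad//path'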
448 class _FeedParserMixin:
449 namespaces = {
450 '': '',
451 'http://backend.userland.com/rss': '',
452 'http://blogs.law.harvard.edu/tech/rss': '',
453 'http://purl.org/rss/1.0/': '',
454 'http://my.netscape.com/rdf/simple/0.9/': '',
455 'http://example.com/newformat#': '',
456 'http://example.com/necho': '',
457 'http://purl.org/echo/': '',
458 'uri/of/echo/namespace#': '',
459 'http://purl.org/pie/': '',
460 'http://purl.org/atom/ns#': '',
461 'http://www.w3.org/2005/Atom': '',
462 'http://purl.org/rss/1.0/modules/rss091#': '',
463
464 'http://webns.net/mvcb/': 'admin',
465 'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
466 'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
467 'http://media.tangent.org/rss/1.0/': 'audio',
468 'http://backend.userland.com/blogChannelModule': 'blogChannel',
469 'http://web.resource.org/cc/': 'cc',
470 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
471 'http://purl.org/rss/1.0/modules/company': 'co',
472 'http://purl.org/rss/1.0/modules/content/': 'content',
473 'http://my.theinfo.org/changed/1.0/rss/': 'cp',
474 'http://purl.org/dc/elements/1.1/': 'dc',
475 'http://purl.org/dc/terms/': 'dcterms',
476 'http://purl.org/rss/1.0/modules/email/': 'email',
477 'http://purl.org/rss/1.0/modules/event/': 'ev',
478 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
479 'http://freshmeat.net/rss/fm/': 'fm',
480 'http://xmlns.com/foaf/0.1/': 'foaf',
481 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
482 'http://www.georss.org/georss': 'georss',
483 'http://www.opengis.net/gml': 'gml',
484 'http://postneo.com/icbm/': 'icbm',
485 'http://purl.org/rss/1.0/modules/image/': 'image',
486 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
487 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
488 'http://purl.org/rss/1.0/modules/link/': 'l',
489 'http://search.yahoo.com/mrss': 'media',
490 # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
491 'http://search.yahoo.com/mrss/': 'media',
492 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
493 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
494 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
495 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
496 'http://purl.org/rss/1.0/modules/reference/': 'ref',
497 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
498 'http://purl.org/rss/1.0/modules/search/': 'search',
499 'http://purl.org/rss/1.0/modules/slash/': 'slash',
500 'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
501 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
502 'http://hacks.benhammersley.com/rss/streaming/': 'str',
503 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
504 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
505 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
506 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
507 'http://purl.org/rss/1.0/modules/threading/': 'thr',
508 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
509 'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
510 'http://wellformedweb.org/commentAPI/': 'wfw',
511 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
512 'http://www.w3.org/1999/xhtml': 'xhtml',
513 'http://www.w3.org/1999/xlink': 'xlink',
514 'http://www.w3.org/XML/1998/namespace': 'xml',
515 'http://podlove.org/simple-chapters': 'psc',
516 }
517 _matchnamespaces = {}
518
519 can_be_relative_uri = set(['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'])
520 can_contain_relative_uris = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
521 can_contain_dangerous_markup = set(['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'])
522 html_types = [u'text/html', u'application/xhtml+xml']
523
524 def __init__(self, baseuri=None, baselang=None, encoding=u'utf-8'):
525 if not self._matchnamespaces:
526 for k, v in self.namespaces.items():
527 self._matchnamespaces[k.lower()] = v
528 self.feeddata = FeedParserDict() # feed-level data
529 self.encoding = encoding # character encoding
530 self.entries = [] # list of entry-level data
531 self.version = u'' # feed type/version, see SUPPORTED_VERSIONS
532 self.namespacesInUse = {} # dictionary of namespaces defined by the feed
533
534 # the following are used internally to track state;
535 # this is really out of control and should be refactored
536 self.infeed = 0
537 self.inentry = 0
538 self.incontent = 0
539 self.intextinput = 0
540 self.inimage = 0
541 self.inauthor = 0
542 self.incontributor = 0
543 self.inpublisher = 0
544 self.insource = 0
545
546 # georss
547 self.ingeometry = 0
548
549 self.sourcedata = FeedParserDict()
550 self.contentparams = FeedParserDict()
551 self._summaryKey = None
552 self.namespacemap = {}
553 self.elementstack = []
554 self.basestack = []
555 self.langstack = []
556 self.baseuri = baseuri or u''
557 self.lang = baselang or None
558 self.svgOK = 0
559 self.title_depth = -1
560 self.depth = 0
561 # psc_chapters_flag prevents multiple psc_chapters from being
562 # captured in a single entry or item. The transition states are
563 # None -> True -> False. psc_chapter elements will only be
564 # captured while it is True.
565 self.psc_chapters_flag = None
566 if baselang:
567 self.feeddata['language'] = baselang.replace('_','-')
568
569 # A map of the following form:
570 # {
571 # object_that_value_is_set_on: {
572 # property_name: depth_of_node_property_was_extracted_from,
573 # other_property: depth_of_node_property_was_extracted_from,
574 # },
575 # }
576 self.property_depth_map = {}
577
578 def _normalize_attributes(self, kv):
579 k = kv[0].lower()
580 v = k in ('rel', 'type') and kv[1].lower() or kv[1]
581 # the sgml parser doesn't handle entities in attributes, nor
582 # does it pass the attribute values through as unicode, while
583 # strict xml parsers do -- account for this difference
584 if isinstance(self, _LooseFeedParser):
585 v = v.replace('&amp;', '&')
586 if not isinstance(v, unicode):
587 v = v.decode('utf-8')
588 return (k, v)
589
590 def unknown_starttag(self, tag, attrs):
591 # increment depth counter
592 self.depth += 1
593
594 # normalize attrs
595 attrs = map(self._normalize_attributes, attrs)
596
597 # track xml:base and xml:lang
598 attrsD = dict(attrs)
599 baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
600 if not isinstance(baseuri, unicode):
601 baseuri = baseuri.decode(self.encoding, 'ignore')
602 # ensure that self.baseuri is always an absolute URI that
603 # uses a whitelisted URI scheme (e.g. not `javascript:`)
604 if self.baseuri:
605 self.baseuri = _makeSafeAbsoluteURI(self.baseuri, baseuri) or self.baseuri
606 else:
607 self.baseuri = _urljoin(self.baseuri, baseuri)
608 lang = attrsD.get('xml:lang', attrsD.get('lang'))
609 if lang == '':
610 # xml:lang could be explicitly set to '', we need to capture that
611 lang = None
612 elif lang is None:
613 # if no xml:lang is specified, use parent lang
614 lang = self.lang
615 if lang:
616 if tag in ('feed', 'rss', 'rdf:RDF'):
617 self.feeddata['language'] = lang.replace('_','-')
618 self.lang = lang
619 self.basestack.append(self.baseuri)
620 self.langstack.append(lang)
621
622 # track namespaces
623 for prefix, uri in attrs:
624 if prefix.startswith('xmlns:'):
625 self.trackNamespace(prefix[6:], uri)
626 elif prefix == 'xmlns':
627 self.trackNamespace(None, uri)
628
629 # track inline content
630 if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
631 if tag in ('xhtml:div', 'div'):
632 return # typepad does this 10/2007
633 # element declared itself as escaped markup, but it isn't really
634 self.contentparams['type'] = u'application/xhtml+xml'
635 if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
636 if tag.find(':') <> -1:
637 prefix, tag = tag.split(':', 1)
638 namespace = self.namespacesInUse.get(prefix, '')
639 if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
640 attrs.append(('xmlns',namespace))
641 if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
642 attrs.append(('xmlns',namespace))
643 if tag == 'svg':
644 self.svgOK += 1
645 return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
646
647 # match namespaces
648 if tag.find(':') <> -1:
649 prefix, suffix = tag.split(':', 1)
650 else:
651 prefix, suffix = '', tag
652 prefix = self.namespacemap.get(prefix, prefix)
653 if prefix:
654 prefix = prefix + '_'
655
656 # special hack for better tracking of empty textinput/image elements in illformed feeds
657 if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
658 self.intextinput = 0
659 if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
660 self.inimage = 0
661
662 # call special handler (if defined) or default handler
663 methodname = '_start_' + prefix + suffix
664 try:
665 method = getattr(self, methodname)
666 return method(attrsD)
667 except AttributeError:
668 # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
669 unknown_tag = prefix + suffix
670 if len(attrsD) == 0:
671 # No attributes so merge it into the enclosing dictionary
672 return self.push(unknown_tag, 1)
673 else:
674 # Has attributes so create it in its own dictionary
675 context = self._getContext()
676 context[unknown_tag] = attrsD
677
678 def unknown_endtag(self, tag):
679 # match namespaces
680 if tag.find(':') <> -1:
681 prefix, suffix = tag.split(':', 1)
682 else:
683 prefix, suffix = '', tag
684 prefix = self.namespacemap.get(prefix, prefix)
685 if prefix:
686 prefix = prefix + '_'
687 if suffix == 'svg' and self.svgOK:
688 self.svgOK -= 1
689
690 # call special handler (if defined) or default handler
691 methodname = '_end_' + prefix + suffix
692 try:
693 if self.svgOK:
694 raise AttributeError()
695 method = getattr(self, methodname)
696 method()
697 except AttributeError:
698 self.pop(prefix + suffix)
699
700 # track inline content
701 if self.incontent and not self.contentparams.get('type', u'xml').endswith(u'xml'):
702 # element declared itself as escaped markup, but it isn't really
703 if tag in ('xhtml:div', 'div'):
704 return # typepad does this 10/2007
705 self.contentparams['type'] = u'application/xhtml+xml'
706 if self.incontent and self.contentparams.get('type') == u'application/xhtml+xml':
707 tag = tag.split(':')[-1]
708 self.handle_data('</%s>' % tag, escape=0)
709
710 # track xml:base and xml:lang going out of scope
711 if self.basestack:
712 self.basestack.pop()
713 if self.basestack and self.basestack[-1]:
714 self.baseuri = self.basestack[-1]
715 if self.langstack:
716 self.langstack.pop()
717 if self.langstack: # and (self.langstack[-1] is not None):
718 self.lang = self.langstack[-1]
719
720 self.depth -= 1
721
722 def handle_charref(self, ref):
723 # called for each character reference, e.g. for '&#160;', ref will be '160'
724 if not self.elementstack:
725 return
726 ref = ref.lower()
727 if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
728 text = '&#%s;' % ref
729 else:
730 if ref[0] == 'x':
731 c = int(ref[1:], 16)
732 else:
733 c = int(ref)
734 text = unichr(c).encode('utf-8')
735 self.elementstack[-1][2].append(text)
736
737 def handle_entityref(self, ref):
738 # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
739 if not self.elementstack:
740 return
741 if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
742 text = '&%s;' % ref
743 elif ref in self.entities:
744 text = self.entities[ref]
745 if text.startswith('&#') and text.endswith(';'):
746 return self.handle_entityref(text)
747 else:
748 try:
749 name2codepoint[ref]
750 except KeyError:
751 text = '&%s;' % ref
752 else:
753 text = unichr(name2codepoint[ref]).encode('utf-8')
754 self.elementstack[-1][2].append(text)
755
756 def handle_data(self, text, escape=1):
757 # called for each block of plain text, i.e. outside of any tag and
758 # not containing any character or entity references
759 if not self.elementstack:
760 return
761 if escape and self.contentparams.get('type') == u'application/xhtml+xml':
762 text = _xmlescape(text)
763 self.elementstack[-1][2].append(text)
764
765 def handle_comment(self, text):
766 # called for each comment, e.g. <!-- insert message here -->
767 pass
768
769 def handle_pi(self, text):
770 # called for each processing instruction, e.g. <?instruction>
771 pass
772
773 def handle_decl(self, text):
774 pass
775
776 def parse_declaration(self, i):
777 # override internal declaration handler to handle CDATA blocks
778 if self.rawdata[i:i+9] == '<![CDATA[':
779 k = self.rawdata.find(']]>', i)
780 if k == -1:
781 # CDATA block began but didn't finish
782 k = len(self.rawdata)
783 return k
784 self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
785 return k+3
786 else:
787 k = self.rawdata.find('>', i)
788 if k >= 0:
789 return k+1
790 else:
791 # We have an incomplete CDATA block.
792 return k
793
794 def mapContentType(self, contentType):
795 contentType = contentType.lower()
796 if contentType == 'text' or contentType == 'plain':
797 contentType = u'text/plain'
798 elif contentType == 'html':
799 contentType = u'text/html'
800 elif contentType == 'xhtml':
801 contentType = u'application/xhtml+xml'
802 return contentType
803
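    # A few illustrative mappings for the helper above: Atom shorthand
    # type names become full MIME types, anything else just lowercases
    # (the method name is illustrative, not part of the parser):
    def _example_map_content_type(self):
        assert self.mapContentType('TEXT') == u'text/plain'
        assert self.mapContentType('xhtml') == u'application/xhtml+xml'
        assert self.mapContentType('text/html') == u'text/html'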
804 def trackNamespace(self, prefix, uri):
805 loweruri = uri.lower()
806 if not self.version:
807 if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
808 self.version = u'rss090'
809 elif loweruri == 'http://purl.org/rss/1.0/':
810 self.version = u'rss10'
811 elif loweruri == 'http://www.w3.org/2005/atom':
812 self.version = u'atom10'
813 if loweruri.find(u'backend.userland.com/rss') <> -1:
814 # match any backend.userland.com namespace
815 uri = u'http://backend.userland.com/rss'
816 loweruri = uri
817 if loweruri in self._matchnamespaces:
818 self.namespacemap[prefix] = self._matchnamespaces[loweruri]
819 self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
820 else:
821 self.namespacesInUse[prefix or ''] = uri
822
823 def resolveURI(self, uri):
824 return _urljoin(self.baseuri or u'', uri)
825
826 def decodeEntities(self, element, data):
827 return data
828
829 def strattrs(self, attrs):
830 return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
831
832 def push(self, element, expectingText):
833 self.elementstack.append([element, expectingText, []])
834
835 def pop(self, element, stripWhitespace=1):
836 if not self.elementstack:
837 return
838 if self.elementstack[-1][0] != element:
839 return
840
841 element, expectingText, pieces = self.elementstack.pop()
842
843 if self.version == u'atom10' and self.contentparams.get('type', u'text') == u'application/xhtml+xml':
844 # remove enclosing child element, but only if it is a <div> and
845 # only if all the remaining content is nested underneath it.
846 # This means that the divs would be retained in the following:
847 # <div>foo</div><div>bar</div>
848 while pieces and len(pieces)>1 and not pieces[-1].strip():
849 del pieces[-1]
850 while pieces and len(pieces)>1 and not pieces[0].strip():
851 del pieces[0]
852 if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
853 depth = 0
854 for piece in pieces[:-1]:
855 if piece.startswith('</'):
856 depth -= 1
857 if depth == 0:
858 break
859 elif piece.startswith('<') and not piece.endswith('/>'):
860 depth += 1
861 else:
862 pieces = pieces[1:-1]
863
864 # Ensure each piece is a str for Python 3
865 for (i, v) in enumerate(pieces):
866 if not isinstance(v, unicode):
867 pieces[i] = v.decode('utf-8')
868
869 output = u''.join(pieces)
870 if stripWhitespace:
871 output = output.strip()
872 if not expectingText:
873 return output
874
875 # decode base64 content
876 if base64 and self.contentparams.get('base64', 0):
877 try:
878 output = _base64decode(output)
879 except binascii.Error:
880 pass
881 except binascii.Incomplete:
882 pass
883 except TypeError:
884 # In Python 3, base64 takes and outputs bytes, not str
885 # This may not be the most correct way to accomplish this
886 output = _base64decode(output.encode('utf-8')).decode('utf-8')
887
888 # resolve relative URIs
889 if (element in self.can_be_relative_uri) and output:
890 # do not resolve guid elements with isPermalink="false"
891 if not element == 'id' or self.guidislink:
892 output = self.resolveURI(output)
893
894 # decode entities within embedded markup
895 if not self.contentparams.get('base64', 0):
896 output = self.decodeEntities(element, output)
897
898 # some feed formats require consumers to guess
899 # whether the content is html or plain text
900 if not self.version.startswith(u'atom') and self.contentparams.get('type') == u'text/plain':
901 if self.lookslikehtml(output):
902 self.contentparams['type'] = u'text/html'
903
904 # remove temporary cruft from contentparams
905 try:
906 del self.contentparams['mode']
907 except KeyError:
908 pass
909 try:
910 del self.contentparams['base64']
911 except KeyError:
912 pass
913
914 is_htmlish = self.mapContentType(self.contentparams.get('type', u'text/html')) in self.html_types
915 # resolve relative URIs within embedded markup
916 if is_htmlish and RESOLVE_RELATIVE_URIS:
917 if element in self.can_contain_relative_uris:
918 output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', u'text/html'))
919
920 # sanitize embedded markup
921 if is_htmlish and SANITIZE_HTML:
922 if element in self.can_contain_dangerous_markup:
923 output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', u'text/html'))
924
925 if self.encoding and not isinstance(output, unicode):
926 output = output.decode(self.encoding, 'ignore')
927
928 # address common error where people take data that is already
929 # utf-8, presume that it is iso-8859-1, and re-encode it.
930 if self.encoding in (u'utf-8', u'utf-8_INVALID_PYTHON_3') and isinstance(output, unicode):
931 try:
932 output = output.encode('iso-8859-1').decode('utf-8')
933 except (UnicodeEncodeError, UnicodeDecodeError):
934 pass
935
936 # map win-1252 extensions to the proper code points
937 if isinstance(output, unicode):
938 output = output.translate(_cp1252)
939
940 # categories/tags/keywords/whatever are handled in _end_category or _end_tags or _end_itunes_keywords
941 if element in ('category', 'tags', 'itunes_keywords'):
942 return output
943
944 if element == 'title' and -1 < self.title_depth <= self.depth:
945 return output
946
947 # store output in appropriate place(s)
948 if self.inentry and not self.insource:
949 if element == 'content':
950 self.entries[-1].setdefault(element, [])
951 contentparams = copy.deepcopy(self.contentparams)
952 contentparams['value'] = output
953 self.entries[-1][element].append(contentparams)
954 elif element == 'link':
955 if not self.inimage:
956 # query variables in urls in link elements are improperly
957 # converted from `?a=1&amp;b=2` to `?a=1&b;=2` as if they're
958 # unhandled character references. fix this special case.
959 output = output.replace('&amp;', '&')
960 output = re.sub("&([A-Za-z0-9_]+);", "&amp;\g<1>", output)
961 self.entries[-1][element] = output
962 if output:
963 self.entries[-1]['links'][-1]['href'] = output
964 else:
965 if element == 'description':
966 element = 'summary'
967 old_value_depth = self.property_depth_map.setdefault(self.entries[-1], {}).get(element)
968 if old_value_depth is None or self.depth <= old_value_depth:
969 self.property_depth_map[self.entries[-1]][element] = self.depth
970 self.entries[-1][element] = output
971 if self.incontent:
972 contentparams = copy.deepcopy(self.contentparams)
973 contentparams['value'] = output
974 self.entries[-1][element + '_detail'] = contentparams
975 elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage):
976 context = self._getContext()
977 if element == 'description':
978 element = 'subtitle'
979 context[element] = output
980 if element == 'link':
981 # fix query variables; see above for the explanation
982 output = re.sub("&([A-Za-z0-9_]+);", "&amp;\g<1>", output)
983 context[element] = output
984 context['links'][-1]['href'] = output
985 elif self.incontent:
986 contentparams = copy.deepcopy(self.contentparams)
987 contentparams['value'] = output
988 context[element + '_detail'] = contentparams
989 return output
990
991 def pushContent(self, tag, attrsD, defaultContentType, expectingText):
992 self.incontent += 1
993 if self.lang:
994 self.lang=self.lang.replace('_','-')
995 self.contentparams = FeedParserDict({
996 'type': self.mapContentType(attrsD.get('type', defaultContentType)),
997 'language': self.lang,
998 'base': self.baseuri})
999 self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
1000 self.push(tag, expectingText)
1001
1002 def popContent(self, tag):
1003 value = self.pop(tag)
1004 self.incontent -= 1
1005 self.contentparams.clear()
1006 return value
1007
1008 # a number of elements in a number of RSS variants are nominally plain
1009 # text, but this is routinely ignored. This is an attempt to detect
1010 # the most common cases. As false positives often result in silent
1011 # data loss, this function errs on the conservative side.
1012 @staticmethod
1013 def lookslikehtml(s):
1014 # must have a close tag or an entity reference to qualify
1015 if not (re.search(r'</(\w+)>',s) or re.search("&#?\w+;",s)):
1016 return
1017
1018 # all tags must be in a restricted subset of valid HTML tags
1019 if filter(lambda t: t.lower() not in _HTMLSanitizer.acceptable_elements,
1020 re.findall(r'</?(\w+)',s)):
1021 return
1022
1023 # all entities must have been defined as valid HTML entities
1024 if filter(lambda e: e not in entitydefs.keys(), re.findall(r'&(\w+);', s)):
1025 return
1026
1027 return 1
1028
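    # A quick illustration of the heuristic above; since it is a
    # staticmethod it can be exercised directly (example values only):
    @staticmethod
    def _example_lookslikehtml():
        assert _FeedParserMixin.lookslikehtml(u'<p>Hello</p>') == 1  # close tag
        assert _FeedParserMixin.lookslikehtml(u'AT&amp;T') == 1      # known entity
        assert _FeedParserMixin.lookslikehtml(u'3 < 5 > 2') is None  # plain text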
1029 def _mapToStandardPrefix(self, name):
1030 colonpos = name.find(':')
1031 if colonpos <> -1:
1032 prefix = name[:colonpos]
1033 suffix = name[colonpos+1:]
1034 prefix = self.namespacemap.get(prefix, prefix)
1035 name = prefix + ':' + suffix
1036 return name
1037
1038 def _getAttribute(self, attrsD, name):
1039 return attrsD.get(self._mapToStandardPrefix(name))
1040
1041 def _isBase64(self, attrsD, contentparams):
1042 if attrsD.get('mode', '') == 'base64':
1043 return 1
1044 if self.contentparams['type'].startswith(u'text/'):
1045 return 0
1046 if self.contentparams['type'].endswith(u'+xml'):
1047 return 0
1048 if self.contentparams['type'].endswith(u'/xml'):
1049 return 0
1050 return 1
1051
1052 def _itsAnHrefDamnIt(self, attrsD):
1053 href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
1054 if href:
1055 try:
1056 del attrsD['url']
1057 except KeyError:
1058 pass
1059 try:
1060 del attrsD['uri']
1061 except KeyError:
1062 pass
1063 attrsD['href'] = href
1064 return attrsD
1065
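    # An illustration of the normalization above: whichever of url/uri/href
    # is present ends up under the single key 'href' (example values only):
    def _example_href_normalization(self):
        attrsD = self._itsAnHrefDamnIt({'url': u'http://example.com/e.mp3', 'length': u'1234'})
        assert attrsD == {'href': u'http://example.com/e.mp3', 'length': u'1234'}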
1066 def _save(self, key, value, overwrite=False):
1067 context = self._getContext()
1068 if overwrite:
1069 context[key] = value
1070 else:
1071 context.setdefault(key, value)
1072
1073 def _start_rss(self, attrsD):
1074 versionmap = {'0.91': u'rss091u',
1075 '0.92': u'rss092',
1076 '0.93': u'rss093',
1077 '0.94': u'rss094'}
1078 #If we're here then this is an RSS feed.
1079 #If we don't have a version or have a version that starts with something
1080 #other than RSS then there's been a mistake. Correct it.
1081 if not self.version or not self.version.startswith(u'rss'):
1082 attr_version = attrsD.get('version', '')
1083 version = versionmap.get(attr_version)
1084 if version:
1085 self.version = version
1086 elif attr_version.startswith('2.'):
1087 self.version = u'rss20'
1088 else:
1089 self.version = u'rss'
1090
1091 def _start_channel(self, attrsD):
1092 self.infeed = 1
1093 self._cdf_common(attrsD)
1094
1095 def _cdf_common(self, attrsD):
1096 if 'lastmod' in attrsD:
1097 self._start_modified({})
1098 self.elementstack[-1][-1] = attrsD['lastmod']
1099 self._end_modified()
1100 if 'href' in attrsD:
1101 self._start_link({})
1102 self.elementstack[-1][-1] = attrsD['href']
1103 self._end_link()
1104
1105 def _start_feed(self, attrsD):
1106 self.infeed = 1
1107 versionmap = {'0.1': u'atom01',
1108 '0.2': u'atom02',
1109 '0.3': u'atom03'}
1110 if not self.version:
1111 attr_version = attrsD.get('version')
1112 version = versionmap.get(attr_version)
1113 if version:
1114 self.version = version
1115 else:
1116 self.version = u'atom'
1117
1118 def _end_channel(self):
1119 self.infeed = 0
1120 _end_feed = _end_channel
1121
1122 def _start_image(self, attrsD):
1123 context = self._getContext()
1124 if not self.inentry:
1125 context.setdefault('image', FeedParserDict())
1126 self.inimage = 1
1127 self.title_depth = -1
1128 self.push('image', 0)
1129
1130 def _end_image(self):
1131 self.pop('image')
1132 self.inimage = 0
1133
1134 def _start_textinput(self, attrsD):
1135 context = self._getContext()
1136 context.setdefault('textinput', FeedParserDict())
1137 self.intextinput = 1
1138 self.title_depth = -1
1139 self.push('textinput', 0)
1140 _start_textInput = _start_textinput
1141
1142 def _end_textinput(self):
1143 self.pop('textinput')
1144 self.intextinput = 0
1145 _end_textInput = _end_textinput
1146
1147 def _start_author(self, attrsD):
1148 self.inauthor = 1
1149 self.push('author', 1)
1150 # Append a new FeedParserDict when expecting an author
1151 context = self._getContext()
1152 context.setdefault('authors', [])
1153 context['authors'].append(FeedParserDict())
1154 _start_managingeditor = _start_author
1155 _start_dc_author = _start_author
1156 _start_dc_creator = _start_author
1157 _start_itunes_author = _start_author
1158
1159 def _end_author(self):
1160 self.pop('author')
1161 self.inauthor = 0
1162 self._sync_author_detail()
1163 _end_managingeditor = _end_author
1164 _end_dc_author = _end_author
1165 _end_dc_creator = _end_author
1166 _end_itunes_author = _end_author
1167
1168 def _start_itunes_owner(self, attrsD):
1169 self.inpublisher = 1
1170 self.push('publisher', 0)
1171
1172 def _end_itunes_owner(self):
1173 self.pop('publisher')
1174 self.inpublisher = 0
1175 self._sync_author_detail('publisher')
1176
1177 def _start_contributor(self, attrsD):
1178 self.incontributor = 1
1179 context = self._getContext()
1180 context.setdefault('contributors', [])
1181 context['contributors'].append(FeedParserDict())
1182 self.push('contributor', 0)
1183
1184 def _end_contributor(self):
1185 self.pop('contributor')
1186 self.incontributor = 0
1187
1188 def _start_dc_contributor(self, attrsD):
1189 self.incontributor = 1
1190 context = self._getContext()
1191 context.setdefault('contributors', [])
1192 context['contributors'].append(FeedParserDict())
1193 self.push('name', 0)
1194
1195 def _end_dc_contributor(self):
1196 self._end_name()
1197 self.incontributor = 0
1198
1199 def _start_name(self, attrsD):
1200 self.push('name', 0)
1201 _start_itunes_name = _start_name
1202
1203 def _end_name(self):
1204 value = self.pop('name')
1205 if self.inpublisher:
1206 self._save_author('name', value, 'publisher')
1207 elif self.inauthor:
1208 self._save_author('name', value)
1209 elif self.incontributor:
1210 self._save_contributor('name', value)
1211 elif self.intextinput:
1212 context = self._getContext()
1213 context['name'] = value
1214 _end_itunes_name = _end_name
1215
1216 def _start_width(self, attrsD):
1217 self.push('width', 0)
1218
1219 def _end_width(self):
1220 value = self.pop('width')
1221 try:
1222 value = int(value)
1223 except ValueError:
1224 value = 0
1225 if self.inimage:
1226 context = self._getContext()
1227 context['width'] = value
1228
1229 def _start_height(self, attrsD):
1230 self.push('height', 0)
1231
1232 def _end_height(self):
1233 value = self.pop('height')
1234 try:
1235 value = int(value)
1236 except ValueError:
1237 value = 0
1238 if self.inimage:
1239 context = self._getContext()
1240 context['height'] = value
1241
1242 def _start_url(self, attrsD):
1243 self.push('href', 1)
1244 _start_homepage = _start_url
1245 _start_uri = _start_url
1246
1247 def _end_url(self):
1248 value = self.pop('href')
1249 if self.inauthor:
1250 self._save_author('href', value)
1251 elif self.incontributor:
1252 self._save_contributor('href', value)
1253 _end_homepage = _end_url
1254 _end_uri = _end_url
1255
1256 def _start_email(self, attrsD):
1257 self.push('email', 0)
1258 _start_itunes_email = _start_email
1259
1260 def _end_email(self):
1261 value = self.pop('email')
1262 if self.inpublisher:
1263 self._save_author('email', value, 'publisher')
1264 elif self.inauthor:
1265 self._save_author('email', value)
1266 elif self.incontributor:
1267 self._save_contributor('email', value)
1268 _end_itunes_email = _end_email
1269
1270 def _getContext(self):
1271 if self.insource:
1272 context = self.sourcedata
1273 elif self.inimage and 'image' in self.feeddata:
1274 context = self.feeddata['image']
1275 elif self.intextinput:
1276 context = self.feeddata['textinput']
1277 elif self.inentry:
1278 context = self.entries[-1]
1279 else:
1280 context = self.feeddata
1281 return context
1282
1283 def _save_author(self, key, value, prefix='author'):
1284 context = self._getContext()
1285 context.setdefault(prefix + '_detail', FeedParserDict())
1286 context[prefix + '_detail'][key] = value
1287 self._sync_author_detail()
1288 context.setdefault('authors', [FeedParserDict()])
1289 context['authors'][-1][key] = value
1290
1291 def _save_contributor(self, key, value):
1292 context = self._getContext()
1293 context.setdefault('contributors', [FeedParserDict()])
1294 context['contributors'][-1][key] = value
1295
1296 def _sync_author_detail(self, key='author'):
1297 context = self._getContext()
1298 detail = context.get('%ss' % key, [FeedParserDict()])[-1]
1299 if detail:
1300 name = detail.get('name')
1301 email = detail.get('email')
1302 if name and email:
1303 context[key] = u'%s (%s)' % (name, email)
1304 elif name:
1305 context[key] = name
1306 elif email:
1307 context[key] = email
1308 else:
1309 author, email = context.get(key), None
1310 if not author:
1311 return
1312 emailmatch = re.search(ur'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author)
1313 if emailmatch:
1314 email = emailmatch.group(0)
1315 # probably a better way to do the following, but it passes all the tests
1316 author = author.replace(email, u'')
1317 author = author.replace(u'()', u'')
1318 author = author.replace(u'&lt;&gt;', u'')
1319 author = author.replace(u'<>', u'')
1320 author = author.strip()
1321 if author and (author[0] == u'('):
1322 author = author[1:]
1323 if author and (author[-1] == u')'):
1324 author = author[:-1]
1325 author = author.strip()
1326 if author or email:
1327 context.setdefault('%s_detail' % key, detail)
1328 if author:
1329 detail['name'] = author
1330 if email:
1331 detail['email'] = email
1332
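    # A sketch of the fallback branch above: a combined RSS-style author
    # string is split into separate name/email fields on the current
    # context (example values only; the method name is illustrative):
    def _example_sync_author_detail(self):
        context = self._getContext()
        context['author'] = u'Jay Scot (jay@example.com)'
        self._sync_author_detail()
        assert context['author_detail']['name'] == u'Jay Scot'
        assert context['author_detail']['email'] == u'jay@example.com'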
1333 def _start_subtitle(self, attrsD):
1334 self.pushContent('subtitle', attrsD, u'text/plain', 1)
1335 _start_tagline = _start_subtitle
1336 _start_itunes_subtitle = _start_subtitle
1337
1338 def _end_subtitle(self):
1339 self.popContent('subtitle')
1340 _end_tagline = _end_subtitle
1341 _end_itunes_subtitle = _end_subtitle
1342
1343 def _start_rights(self, attrsD):
1344 self.pushContent('rights', attrsD, u'text/plain', 1)
1345 _start_dc_rights = _start_rights
1346 _start_copyright = _start_rights
1347
1348 def _end_rights(self):
1349 self.popContent('rights')
1350 _end_dc_rights = _end_rights
1351 _end_copyright = _end_rights
1352
1353 def _start_item(self, attrsD):
1354 self.entries.append(FeedParserDict())
1355 self.push('item', 0)
1356 self.inentry = 1
1357 self.guidislink = 0
1358 self.title_depth = -1
1359 self.psc_chapters_flag = None
1360 id = self._getAttribute(attrsD, 'rdf:about')
1361 if id:
1362 context = self._getContext()
1363 context['id'] = id
1364 self._cdf_common(attrsD)
1365 _start_entry = _start_item
1366
1367 def _end_item(self):
1368 self.pop('item')
1369 self.inentry = 0
1370 _end_entry = _end_item
1371
1372 def _start_dc_language(self, attrsD):
1373 self.push('language', 1)
1374 _start_language = _start_dc_language
1375
1376 def _end_dc_language(self):
1377 self.lang = self.pop('language')
1378 _end_language = _end_dc_language
1379
1380 def _start_dc_publisher(self, attrsD):
1381 self.push('publisher', 1)
1382 _start_webmaster = _start_dc_publisher
1383
1384 def _end_dc_publisher(self):
1385 self.pop('publisher')
1386 self._sync_author_detail('publisher')
1387 _end_webmaster = _end_dc_publisher
1388
1389 def _start_dcterms_valid(self, attrsD):
1390 self.push('validity', 1)
1391
1392 def _end_dcterms_valid(self):
1393 for validity_detail in self.pop('validity').split(';'):
1394 if '=' in validity_detail:
1395 key, value = validity_detail.split('=', 1)
1396 if key == 'start':
1397 self._save('validity_start', value, overwrite=True)
1398 self._save('validity_start_parsed', _parse_date(value), overwrite=True)
1399 elif key == 'end':
1400 self._save('validity_end', value, overwrite=True)
1401 self._save('validity_end_parsed', _parse_date(value), overwrite=True)
1402
1403 def _start_published(self, attrsD):
1404 self.push('published', 1)
1405 _start_dcterms_issued = _start_published
1406 _start_issued = _start_published
1407 _start_pubdate = _start_published
1408
1409 def _end_published(self):
1410 value = self.pop('published')
1411 self._save('published_parsed', _parse_date(value), overwrite=True)
1412 _end_dcterms_issued = _end_published
1413 _end_issued = _end_published
1414 _end_pubdate = _end_published
1415
1416 def _start_updated(self, attrsD):
1417 self.push('updated', 1)
1418 _start_modified = _start_updated
1419 _start_dcterms_modified = _start_updated
1420 _start_dc_date = _start_updated
1421 _start_lastbuilddate = _start_updated
1422
1423 def _end_updated(self):
1424 value = self.pop('updated')
1425 parsed_value = _parse_date(value)
1426 self._save('updated_parsed', parsed_value, overwrite=True)
1427 _end_modified = _end_updated
1428 _end_dcterms_modified = _end_updated
1429 _end_dc_date = _end_updated
1430 _end_lastbuilddate = _end_updated
1431
1432 def _start_created(self, attrsD):
1433 self.push('created', 1)
1434 _start_dcterms_created = _start_created
1435
1436 def _end_created(self):
1437 value = self.pop('created')
1438 self._save('created_parsed', _parse_date(value), overwrite=True)
1439 _end_dcterms_created = _end_created
1440
1441 def _start_expirationdate(self, attrsD):
1442 self.push('expired', 1)
1443
1444 def _end_expirationdate(self):
1445 self._save('expired_parsed', _parse_date(self.pop('expired')), overwrite=True)
1446
1447 # geospatial location, or "where", from georss.org
1448
1449 def _start_georssgeom(self, attrsD):
1450 self.push('geometry', 0)
1451 context = self._getContext()
1452 context['where'] = FeedParserDict()
1453
1454 _start_georss_point = _start_georssgeom
1455 _start_georss_line = _start_georssgeom
1456 _start_georss_polygon = _start_georssgeom
1457 _start_georss_box = _start_georssgeom
1458
1459 def _save_where(self, geometry):
1460 context = self._getContext()
1461 context['where'].update(geometry)
1462
1463 def _end_georss_point(self):
1464 geometry = _parse_georss_point(self.pop('geometry'))
1465 if geometry:
1466 self._save_where(geometry)
1467
1468 def _end_georss_line(self):
1469 geometry = _parse_georss_line(self.pop('geometry'))
1470 if geometry:
1471 self._save_where(geometry)
1472
1473 def _end_georss_polygon(self):
1474 this = self.pop('geometry')
1475 geometry = _parse_georss_polygon(this)
1476 if geometry:
1477 self._save_where(geometry)
1478
1479 def _end_georss_box(self):
1480 geometry = _parse_georss_box(self.pop('geometry'))
1481 if geometry:
1482 self._save_where(geometry)
1483
1484 def _start_where(self, attrsD):
1485 self.push('where', 0)
1486 context = self._getContext()
1487 context['where'] = FeedParserDict()
1488 _start_georss_where = _start_where
1489
1490 def _parse_srs_attrs(self, attrsD):
1491 srsName = attrsD.get('srsname')
1492 try:
1493 srsDimension = int(attrsD.get('srsdimension', '2'))
1494 except ValueError:
1495 srsDimension = 2
1496 context = self._getContext()
1497 context['where']['srsName'] = srsName
1498 context['where']['srsDimension'] = srsDimension
1499
1500 def _start_gml_point(self, attrsD):
1501 self._parse_srs_attrs(attrsD)
1502 self.ingeometry = 1
1503 self.push('geometry', 0)
1504
1505 def _start_gml_linestring(self, attrsD):
1506 self._parse_srs_attrs(attrsD)
1507 self.ingeometry = 'linestring'
1508 self.push('geometry', 0)
1509
1510 def _start_gml_polygon(self, attrsD):
1511 self._parse_srs_attrs(attrsD)
1512 self.push('geometry', 0)
1513
1514 def _start_gml_exterior(self, attrsD):
1515 self.push('geometry', 0)
1516
1517 def _start_gml_linearring(self, attrsD):
1518 self.ingeometry = 'polygon'
1519 self.push('geometry', 0)
1520
1521 def _start_gml_pos(self, attrsD):
1522 self.push('pos', 0)
1523
1524 def _end_gml_pos(self):
1525 this = self.pop('pos')
1526 context = self._getContext()
1527 srsName = context['where'].get('srsName')
1528 srsDimension = context['where'].get('srsDimension', 2)
1529 swap = True
1530 if srsName and "EPSG" in srsName:
1531 epsg = int(srsName.split(":")[-1])
1532 swap = bool(epsg in _geogCS)
1533 geometry = _parse_georss_point(this, swap=swap, dims=srsDimension)
1534 if geometry:
1535 self._save_where(geometry)
1536
1537 def _start_gml_poslist(self, attrsD):
1538 self.push('pos', 0)
1539
1540 def _end_gml_poslist(self):
1541 this = self.pop('pos')
1542 context = self._getContext()
1543 srsName = context['where'].get('srsName')
1544 srsDimension = context['where'].get('srsDimension', 2)
1545 swap = True
1546 if srsName and "EPSG" in srsName:
1547 epsg = int(srsName.split(":")[-1])
1548 swap = bool(epsg in _geogCS)
1549 geometry = _parse_poslist(
1550 this, self.ingeometry, swap=swap, dims=srsDimension)
1551 if geometry:
1552 self._save_where(geometry)
1553
1554 def _end_geom(self):
1555 self.ingeometry = 0
1556 self.pop('geometry')
1557 _end_gml_point = _end_geom
1558 _end_gml_linestring = _end_geom
1559 _end_gml_linearring = _end_geom
1560 _end_gml_exterior = _end_geom
1561 _end_gml_polygon = _end_geom
1562
1563 def _end_where(self):
1564 self.pop('where')
1565 _end_georss_where = _end_where
1566
1567 # end geospatial
1568
1569 def _start_cc_license(self, attrsD):
1570 context = self._getContext()
1571 value = self._getAttribute(attrsD, 'rdf:resource')
1572 attrsD = FeedParserDict()
1573 attrsD['rel'] = u'license'
1574 if value:
1575 attrsD['href']=value
1576 context.setdefault('links', []).append(attrsD)
1577
1578 def _start_creativecommons_license(self, attrsD):
1579 self.push('license', 1)
1580 _start_creativeCommons_license = _start_creativecommons_license
1581
1582 def _end_creativecommons_license(self):
1583 value = self.pop('license')
1584 context = self._getContext()
1585 attrsD = FeedParserDict()
1586 attrsD['rel'] = u'license'
1587 if value:
1588 attrsD['href'] = value
1589 context.setdefault('links', []).append(attrsD)
1590 del context['license']
1591 _end_creativeCommons_license = _end_creativecommons_license
1592
1593 def _addTag(self, term, scheme, label):
1594 context = self._getContext()
1595 tags = context.setdefault('tags', [])
1596 if (not term) and (not scheme) and (not label):
1597 return
1598 value = FeedParserDict(term=term, scheme=scheme, label=label)
1599 if value not in tags:
1600 tags.append(value)
1601
1602 def _start_tags(self, attrsD):
1603 # This is a completely-made up element. Its semantics are determined
1604 # only by a single feed that precipitated bug report 392 on Google Code.
1605 # In short, this is junk code.
1606 self.push('tags', 1)
1607
1608 def _end_tags(self):
1609 for term in self.pop('tags').split(','):
1610 self._addTag(term.strip(), None, None)
1611
1612 def _start_category(self, attrsD):
1613 term = attrsD.get('term')
1614 scheme = attrsD.get('scheme', attrsD.get('domain'))
1615 label = attrsD.get('label')
1616 self._addTag(term, scheme, label)
1617 self.push('category', 1)
1618 _start_dc_subject = _start_category
1619 _start_keywords = _start_category
1620
1621 def _start_media_category(self, attrsD):
1622 attrsD.setdefault('scheme', u'http://search.yahoo.com/mrss/category_schema')
1623 self._start_category(attrsD)
1624
1625 def _end_itunes_keywords(self):
1626 for term in self.pop('itunes_keywords').split(','):
1627 if term.strip():
1628 self._addTag(term.strip(), u'http://www.itunes.com/', None)
1629
1630 def _end_media_keywords(self):
1631 for term in self.pop('media_keywords').split(','):
1632 if term.strip():
1633 self._addTag(term.strip(), None, None)
1634
1635 def _start_itunes_category(self, attrsD):
1636 self._addTag(attrsD.get('text'), u'http://www.itunes.com/', None)
1637 self.push('category', 1)
1638
1639 def _end_category(self):
1640 value = self.pop('category')
1641 if not value:
1642 return
1643 context = self._getContext()
1644 tags = context['tags']
1645 if value and len(tags) and not tags[-1]['term']:
1646 tags[-1]['term'] = value
1647 else:
1648 self._addTag(value, None, None)
1649 _end_dc_subject = _end_category
1650 _end_keywords = _end_category
1651 _end_itunes_category = _end_category
1652 _end_media_category = _end_category
1653
1654 def _start_cloud(self, attrsD):
1655 self._getContext()['cloud'] = FeedParserDict(attrsD)
1656
1657 def _start_link(self, attrsD):
1658 attrsD.setdefault('rel', u'alternate')
1659 if attrsD['rel'] == u'self':
1660 attrsD.setdefault('type', u'application/atom+xml')
1661 else:
1662 attrsD.setdefault('type', u'text/html')
1663 context = self._getContext()
1664 attrsD = self._itsAnHrefDamnIt(attrsD)
1665 if 'href' in attrsD:
1666 attrsD['href'] = self.resolveURI(attrsD['href'])
1667 expectingText = self.infeed or self.inentry or self.insource
1668 context.setdefault('links', [])
1669 if not (self.inentry and self.inimage):
1670 context['links'].append(FeedParserDict(attrsD))
1671 if 'href' in attrsD:
1672 expectingText = 0
1673 if (attrsD.get('rel') == u'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
1674 context['link'] = attrsD['href']
1675 else:
1676 self.push('link', expectingText)
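# Illustratively (hypothetical markup): <link href="http://example.org/"/>
# picks up rel="alternate" and type="text/html" by default and, being an
# HTML alternate with an href, also populates context['link']; a rel="self"
# link instead defaults to type="application/atom+xml" and is only appended
# to context['links'].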
1677
1678 def _end_link(self):
1679 value = self.pop('link')
1680
1681 def _start_guid(self, attrsD):
1682 self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1683 self.push('id', 1)
1684 _start_id = _start_guid
1685
1686 def _end_guid(self):
1687 value = self.pop('id')
1688 self._save('guidislink', self.guidislink and 'link' not in self._getContext())
1689 if self.guidislink:
1690 # guid acts as link, but only if 'ispermalink' is not present or is 'true',
1691 # and only if the item doesn't already have a link element
1692 self._save('link', value)
1693 _end_id = _end_guid
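# Illustrative behaviour (hypothetical markup): for
#   <guid isPermaLink="true">http://example.org/post/1</guid>
# the guid value is also saved as the entry's link, unless the item already
# carries a <link> element of its own.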
1694
1695 def _start_title(self, attrsD):
1696 if self.svgOK:
1697 return self.unknown_starttag('title', attrsD.items())
1698 self.pushContent('title', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1699 _start_dc_title = _start_title
1700 _start_media_title = _start_title
1701
1702 def _end_title(self):
1703 if self.svgOK:
1704 return
1705 value = self.popContent('title')
1706 if not value:
1707 return
1708 self.title_depth = self.depth
1709 _end_dc_title = _end_title
1710
1711 def _end_media_title(self):
1712 title_depth = self.title_depth
1713 self._end_title()
1714 self.title_depth = title_depth
1715
1716 def _start_description(self, attrsD):
1717 context = self._getContext()
1718 if 'summary' in context:
1719 self._summaryKey = 'content'
1720 self._start_content(attrsD)
1721 else:
1722 self.pushContent('description', attrsD, u'text/html', self.infeed or self.inentry or self.insource)
1723 _start_dc_description = _start_description
1724 _start_media_description = _start_description
1725
1726 def _start_abstract(self, attrsD):
1727 self.pushContent('description', attrsD, u'text/plain', self.infeed or self.inentry or self.insource)
1728
1729 def _end_description(self):
1730 if self._summaryKey == 'content':
1731 self._end_content()
1732 else:
1733 value = self.popContent('description')
1734 self._summaryKey = None
1735 _end_abstract = _end_description
1736 _end_dc_description = _end_description
1737 _end_media_description = _end_description
1738
1739 def _start_info(self, attrsD):
1740 self.pushContent('info', attrsD, u'text/plain', 1)
1741 _start_feedburner_browserfriendly = _start_info
1742
1743 def _end_info(self):
1744 self.popContent('info')
1745 _end_feedburner_browserfriendly = _end_info
1746
1747 def _start_generator(self, attrsD):
1748 if attrsD:
1749 attrsD = self._itsAnHrefDamnIt(attrsD)
1750 if 'href' in attrsD:
1751 attrsD['href'] = self.resolveURI(attrsD['href'])
1752 self._getContext()['generator_detail'] = FeedParserDict(attrsD)
1753 self.push('generator', 1)
1754
1755 def _end_generator(self):
1756 value = self.pop('generator')
1757 context = self._getContext()
1758 if 'generator_detail' in context:
1759 context['generator_detail']['name'] = value
1760
1761 def _start_admin_generatoragent(self, attrsD):
1762 self.push('generator', 1)
1763 value = self._getAttribute(attrsD, 'rdf:resource')
1764 if value:
1765 self.elementstack[-1][2].append(value)
1766 self.pop('generator')
1767 self._getContext()['generator_detail'] = FeedParserDict({'href': value})
1768
1769 def _start_admin_errorreportsto(self, attrsD):
1770 self.push('errorreportsto', 1)
1771 value = self._getAttribute(attrsD, 'rdf:resource')
1772 if value:
1773 self.elementstack[-1][2].append(value)
1774 self.pop('errorreportsto')
1775
1776 def _start_summary(self, attrsD):
1777 context = self._getContext()
1778 if 'summary' in context:
1779 self._summaryKey = 'content'
1780 self._start_content(attrsD)
1781 else:
1782 self._summaryKey = 'summary'
1783 self.pushContent(self._summaryKey, attrsD, u'text/plain', 1)
1784 _start_itunes_summary = _start_summary
1785
1786 def _end_summary(self):
1787 if self._summaryKey == 'content':
1788 self._end_content()
1789 else:
1790 self.popContent(self._summaryKey or 'summary')
1791 self._summaryKey = None
1792 _end_itunes_summary = _end_summary
1793
1794 def _start_enclosure(self, attrsD):
1795 attrsD = self._itsAnHrefDamnIt(attrsD)
1796 context = self._getContext()
1797 attrsD['rel'] = u'enclosure'
1798 context.setdefault('links', []).append(FeedParserDict(attrsD))
1799
1800 def _start_source(self, attrsD):
1801 if 'url' in attrsD:
1802 # This means that we're processing a source element from an RSS 2.0 feed
1803 self.sourcedata['href'] = attrsD[u'url']
1804 self.push('source', 1)
1805 self.insource = 1
1806 self.title_depth = -1
1807
1808 def _end_source(self):
1809 self.insource = 0
1810 value = self.pop('source')
1811 if value:
1812 self.sourcedata['title'] = value
1813 self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1814 self.sourcedata.clear()
1815
1816 def _start_content(self, attrsD):
1817 self.pushContent('content', attrsD, u'text/plain', 1)
1818 src = attrsD.get('src')
1819 if src:
1820 self.contentparams['src'] = src
1821 self.push('content', 1)
1822
1823 def _start_body(self, attrsD):
1824 self.pushContent('content', attrsD, u'application/xhtml+xml', 1)
1825 _start_xhtml_body = _start_body
1826
1827 def _start_content_encoded(self, attrsD):
1828 self.pushContent('content', attrsD, u'text/html', 1)
1829 _start_fullitem = _start_content_encoded
1830
1831 def _end_content(self):
1832 copyToSummary = self.mapContentType(self.contentparams.get('type')) in ([u'text/plain'] + self.html_types)
1833 value = self.popContent('content')
1834 if copyToSummary:
1835 self._save('summary', value)
1836
1837 _end_body = _end_content
1838 _end_xhtml_body = _end_content
1839 _end_content_encoded = _end_content
1840 _end_fullitem = _end_content
1841
1842 def _start_itunes_image(self, attrsD):
1843 self.push('itunes_image', 0)
1844 if attrsD.get('href'):
1845 self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
1846 elif attrsD.get('url'):
1847 self._getContext()['image'] = FeedParserDict({'href': attrsD.get('url')})
1848 _start_itunes_link = _start_itunes_image
1849
1850 def _end_itunes_block(self):
1851 value = self.pop('itunes_block', 0)
1852 self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1853
1854 def _end_itunes_explicit(self):
1855 value = self.pop('itunes_explicit', 0)
1856 # Convert 'yes' -> True, 'clean' -> False, and any other value -> None
1857 # False and None both evaluate as False, so the difference can be ignored
1858 # by applications that only need to know if the content is explicit.
1859 self._getContext()['itunes_explicit'] = (None, False, True)[(value == 'yes' and 2) or value == 'clean' or 0]
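# The tuple indexing above encodes a three-way mapping:
#   value == 'yes'   -> (value == 'yes' and 2) == 2     -> index 2 -> True
#   value == 'clean' -> (value == 'clean') == True (1)  -> index 1 -> False
#   anything else    -> 0                               -> index 0 -> None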
1860
1861 def _start_media_group(self, attrsD):
1862 # don't do anything, but don't break the enclosed tags either
1863 pass
1864
1865 def _start_media_rating(self, attrsD):
1866 context = self._getContext()
1867 context.setdefault('media_rating', attrsD)
1868 self.push('rating', 1)
1869
1870 def _end_media_rating(self):
1871 rating = self.pop('rating')
1872 if rating is not None and rating.strip():
1873 context = self._getContext()
1874 context['media_rating']['content'] = rating
1875
1876 def _start_media_credit(self, attrsD):
1877 context = self._getContext()
1878 context.setdefault('media_credit', [])
1879 context['media_credit'].append(attrsD)
1880 self.push('credit', 1)
1881
1882 def _end_media_credit(self):
1883 credit = self.pop('credit')
1884 if credit is not None and credit.strip():
1885 context = self._getContext()
1886 context['media_credit'][-1]['content'] = credit
1887
1888 def _start_media_restriction(self, attrsD):
1889 context = self._getContext()
1890 context.setdefault('media_restriction', attrsD)
1891 self.push('restriction', 1)
1892
1893 def _end_media_restriction(self):
1894 restriction = self.pop('restriction')
1895 if restriction is not None and restriction.strip():
1896 context = self._getContext()
1897 context['media_restriction']['content'] = [cc.strip().lower() for cc in restriction.split(' ')]
1898
1899 def _start_media_license(self, attrsD):
1900 context = self._getContext()
1901 context.setdefault('media_license', attrsD)
1902 self.push('license', 1)
1903
1904 def _end_media_license(self):
1905 license = self.pop('license')
1906 if license is not None and license.strip():
1907 context = self._getContext()
1908 context['media_license']['content'] = license
1909
1910 def _start_media_content(self, attrsD):
1911 context = self._getContext()
1912 context.setdefault('media_content', [])
1913 context['media_content'].append(attrsD)
1914
1915 def _start_media_thumbnail(self, attrsD):
1916 context = self._getContext()
1917 context.setdefault('media_thumbnail', [])
1918 self.push('url', 1) # new
1919 context['media_thumbnail'].append(attrsD)
1920
1921 def _end_media_thumbnail(self):
1922 url = self.pop('url')
1923 context = self._getContext()
1924 if url is not None and url.strip():
1925 if 'url' not in context['media_thumbnail'][-1]:
1926 context['media_thumbnail'][-1]['url'] = url
1927
1928 def _start_media_player(self, attrsD):
1929 self.push('media_player', 0)
1930 self._getContext()['media_player'] = FeedParserDict(attrsD)
1931
1932 def _end_media_player(self):
1933 value = self.pop('media_player')
1934 context = self._getContext()
1935 context['media_player']['content'] = value
1936
1937 def _start_newlocation(self, attrsD):
1938 self.push('newlocation', 1)
1939
1940 def _end_newlocation(self):
1941 url = self.pop('newlocation')
1942 context = self._getContext()
1943 # don't set newlocation if the context isn't right
1944 if context is not self.feeddata:
1945 return
1946 context['newlocation'] = _makeSafeAbsoluteURI(self.baseuri, url.strip())
1947
1948 def _start_psc_chapters(self, attrsD):
1949 if self.psc_chapters_flag is None:
1950 # Transition from None -> True
1951 self.psc_chapters_flag = True
1952 attrsD['chapters'] = []
1953 self._getContext()['psc_chapters'] = FeedParserDict(attrsD)
1954
1955 def _end_psc_chapters(self):
1956 # Transition from True -> False
1957 self.psc_chapters_flag = False
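# The flag is deliberately tri-state: None means no <psc:chapters> element
# has been seen yet, True means we are inside the first one, and False means
# it has already closed; chapters in any later <psc:chapters> element are
# therefore ignored.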
1958
1959 def _start_psc_chapter(self, attrsD):
1960 if self.psc_chapters_flag:
1961 start = self._getAttribute(attrsD, 'start')
1962 attrsD['start_parsed'] = _parse_psc_chapter_start(start)
1963
1964 context = self._getContext()['psc_chapters']
1965 context['chapters'].append(FeedParserDict(attrsD))
1966
1967
1968 if _XML_AVAILABLE:
1969 class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
1970 def __init__(self, baseuri, baselang, encoding):
1971 xml.sax.handler.ContentHandler.__init__(self)
1972 _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
1973 self.bozo = 0
1974 self.exc = None
1975 self.decls = {}
1976
1977 def startPrefixMapping(self, prefix, uri):
1978 if not uri:
1979 return
1980 # Jython uses '' instead of None; standardize on None
1981 prefix = prefix or None
1982 self.trackNamespace(prefix, uri)
1983 if prefix and uri == 'http://www.w3.org/1999/xlink':
1984 self.decls['xmlns:' + prefix] = uri
1985
1986 def startElementNS(self, name, qname, attrs):
1987 namespace, localname = name
1988 lowernamespace = str(namespace or '').lower()
1989 if lowernamespace.find(u'backend.userland.com/rss') != -1:
1990 # match any backend.userland.com namespace
1991 namespace = u'http://backend.userland.com/rss'
1992 lowernamespace = namespace
1993 if qname and qname.find(':') > 0:
1994 givenprefix = qname.split(':')[0]
1995 else:
1996 givenprefix = None
1997 prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
1998 if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and givenprefix not in self.namespacesInUse:
1999 raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
2000 localname = str(localname).lower()
2001
2002 # qname implementation is horribly broken in Python 2.1 (it
2003 # doesn't report any), and slightly broken in Python 2.2 (it
2004 # doesn't report the xml: namespace). So we match up namespaces
2005 # with a known list first, and then possibly override them with
2006 # the qnames the SAX parser gives us (if indeed it gives us any
2007 # at all). Thanks to MatejC for helping me test this and
2008 # tirelessly telling me that it didn't work yet.
2009 attrsD, self.decls = self.decls, {}
2010 if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
2011 attrsD['xmlns']=namespace
2012 if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
2013 attrsD['xmlns']=namespace
2014
2015 if prefix:
2016 localname = prefix.lower() + ':' + localname
2017 elif namespace and not qname: #Expat
2018 for name,value in self.namespacesInUse.items():
2019 if name and value == namespace:
2020 localname = name + ':' + localname
2021 break
2022
2023 for (namespace, attrlocalname), attrvalue in attrs.items():
2024 lowernamespace = (namespace or '').lower()
2025 prefix = self._matchnamespaces.get(lowernamespace, '')
2026 if prefix:
2027 attrlocalname = prefix + ':' + attrlocalname
2028 attrsD[str(attrlocalname).lower()] = attrvalue
2029 for qname in attrs.getQNames():
2030 attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
2031 localname = str(localname).lower()
2032 self.unknown_starttag(localname, attrsD.items())
2033
2034 def characters(self, text):
2035 self.handle_data(text)
2036
2037 def endElementNS(self, name, qname):
2038 namespace, localname = name
2039 lowernamespace = str(namespace or '').lower()
2040 if qname and qname.find(':') > 0:
2041 givenprefix = qname.split(':')[0]
2042 else:
2043 givenprefix = ''
2044 prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
2045 if prefix:
2046 localname = prefix + ':' + localname
2047 elif namespace and not qname: #Expat
2048 for name,value in self.namespacesInUse.items():
2049 if name and value == namespace:
2050 localname = name + ':' + localname
2051 break
2052 localname = str(localname).lower()
2053 self.unknown_endtag(localname)
2054
2055 def error(self, exc):
2056 self.bozo = 1
2057 self.exc = exc
2058
2059 # drv_libxml2 calls warning() in some cases
2060 warning = error
2061
2062 def fatalError(self, exc):
2063 self.error(exc)
2064 raise exc
2065
2066 class _BaseHTMLProcessor(sgmllib.SGMLParser):
2067 special = re.compile('''[<>'"]''')
2068 bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
2069 elements_no_end_tag = set([
2070 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
2071 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
2072 'source', 'track', 'wbr'
2073 ])
2074
2075 def __init__(self, encoding, _type):
2076 self.encoding = encoding
2077 self._type = _type
2078 sgmllib.SGMLParser.__init__(self)
2079
2080 def reset(self):
2081 self.pieces = []
2082 sgmllib.SGMLParser.reset(self)
2083
2084 def _shorttag_replace(self, match):
2085 tag = match.group(1)
2086 if tag in self.elements_no_end_tag:
2087 return '<' + tag + ' />'
2088 else:
2089 return '<' + tag + '></' + tag + '>'
2090
2091 # By declaring these methods and overriding their compiled code
2092 # with the code from sgmllib, the original code will execute in
2093 # feedparser's scope instead of sgmllib's. This means that the
2094 # `tagfind` and `charref` regular expressions will be found as
2095 # they're declared above, not as they're declared in sgmllib.
2096 def goahead(self, i):
2097 pass
2098 goahead.func_code = sgmllib.SGMLParser.goahead.func_code
2099
2100 def __parse_starttag(self, i):
2101 pass
2102 __parse_starttag.func_code = sgmllib.SGMLParser.parse_starttag.func_code
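# A minimal sketch of the same Python 2 trick (hypothetical names):
#   def fast(self): pass
#   fast.func_code = SomeBase.slow.func_code
# Rebinding func_code makes fast() run SomeBase.slow's bytecode, but with
# module-level names resolved in *this* module's namespace.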
2103
2104 def parse_starttag(self,i):
2105 j = self.__parse_starttag(i)
2106 if self._type == 'application/xhtml+xml':
2107 if j>2 and self.rawdata[j-2:j]=='/>':
2108 self.unknown_endtag(self.lasttag)
2109 return j
2110
2111 def feed(self, data):
2112 data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
2113 data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
2114 data = data.replace('&#39;', "'")
2115 data = data.replace('&#34;', '"')
2116 try:
2117 bytes
2118 if bytes is str:
2119 raise NameError
2120 self.encoding = self.encoding + u'_INVALID_PYTHON_3'
2121 except NameError:
2122 if self.encoding and isinstance(data, unicode):
2123 data = data.encode(self.encoding)
2124 sgmllib.SGMLParser.feed(self, data)
2125 sgmllib.SGMLParser.close(self)
2126
2127 def normalize_attrs(self, attrs):
2128 if not attrs:
2129 return attrs
2130 # utility method to be called by descendants
2131 attrs = dict([(k.lower(), v) for k, v in attrs]).items()
2132 attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
2133 attrs.sort()
2134 return attrs
2135
2136 def unknown_starttag(self, tag, attrs):
2137 # called for each start tag
2138 # attrs is a list of (attr, value) tuples
2139 # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
2140 uattrs = []
2141 strattrs=''
2142 if attrs:
2143 for key, value in attrs:
2144 value=value.replace('&gt;','>').replace('&lt;','<').replace('&quot;','"')
2145 value = self.bare_ampersand.sub("&amp;", value)
2146 # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
2147 if not isinstance(value, unicode):
2148 value = value.decode(self.encoding, 'ignore')
2149 try:
2150 # Currently, in Python 3 the key is already a str, and cannot be decoded again
2151 uattrs.append((unicode(key, self.encoding), value))
2152 except TypeError:
2153 uattrs.append((key, value))
2154 strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs])
2155 if self.encoding:
2156 try:
2157 strattrs = strattrs.encode(self.encoding)
2158 except (UnicodeEncodeError, LookupError):
2159 pass
2160 if tag in self.elements_no_end_tag:
2161 self.pieces.append('<%s%s />' % (tag, strattrs))
2162 else:
2163 self.pieces.append('<%s%s>' % (tag, strattrs))
2164
2165 def unknown_endtag(self, tag):
2166 # called for each end tag, e.g. for </pre>, tag will be 'pre'
2167 # Reconstruct the original end tag.
2168 if tag not in self.elements_no_end_tag:
2169 self.pieces.append("</%s>" % tag)
2170
2171 def handle_charref(self, ref):
2172 # called for each character reference, e.g. for '&#160;', ref will be '160'
2173 # Reconstruct the original character reference.
2174 ref = ref.lower()
2175 if ref.startswith('x'):
2176 value = int(ref[1:], 16)
2177 else:
2178 value = int(ref)
2179
2180 if value in _cp1252:
2181 self.pieces.append('&#%s;' % hex(ord(_cp1252[value]))[1:])
2182 else:
2183 self.pieces.append('&#%s;' % ref)
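# For example, the Windows-1252 numeric reference '&#146;' falls in _cp1252
# and is rewritten to its Unicode equivalent '&#x2019;' (right single
# quotation mark); references outside that range pass through unchanged.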
2184
2185 def handle_entityref(self, ref):
2186 # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
2187 # Reconstruct the original entity reference.
2188 if ref in name2codepoint or ref == 'apos':
2189 self.pieces.append('&%s;' % ref)
2190 else:
2191 self.pieces.append('&%s' % ref)
2192
2193 def handle_data(self, text):
2194 # called for each block of plain text, i.e. outside of any tag and
2195 # not containing any character or entity references
2196 # Store the original text verbatim.
2197 self.pieces.append(text)
2198
2199 def handle_comment(self, text):
2200 # called for each HTML comment, e.g. <!-- insert Javascript code here -->
2201 # Reconstruct the original comment.
2202 self.pieces.append('<!--%s-->' % text)
2203
2204 def handle_pi(self, text):
2205 # called for each processing instruction, e.g. <?instruction>
2206 # Reconstruct original processing instruction.
2207 self.pieces.append('<?%s>' % text)
2208
2209 def handle_decl(self, text):
2210 # called for the DOCTYPE, if present, e.g.
2211 # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
2212 # "http://www.w3.org/TR/html4/loose.dtd">
2213 # Reconstruct original DOCTYPE
2214 self.pieces.append('<!%s>' % text)
2215
2216 _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
2217 def _scan_name(self, i, declstartpos):
2218 rawdata = self.rawdata
2219 n = len(rawdata)
2220 if i == n:
2221 return None, -1
2222 m = self._new_declname_match(rawdata, i)
2223 if m:
2224 s = m.group()
2225 name = s.strip()
2226 if (i + len(s)) == n:
2227 return None, -1 # end of buffer
2228 return name.lower(), m.end()
2229 else:
2230 self.handle_data(rawdata)
2231 # self.updatepos(declstartpos, i)
2232 return None, -1
2233
2234 def convert_charref(self, name):
2235 return '&#%s;' % name
2236
2237 def convert_entityref(self, name):
2238 return '&%s;' % name
2239
2240 def output(self):
2241 '''Return processed HTML as a single string'''
2242 return ''.join([str(p) for p in self.pieces])
2243
2244 def parse_declaration(self, i):
2245 try:
2246 return sgmllib.SGMLParser.parse_declaration(self, i)
2247 except sgmllib.SGMLParseError:
2248 # escape the doctype declaration and continue parsing
2249 self.handle_data('<')
2250 return i+1
2251
2252 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
2253 def __init__(self, baseuri, baselang, encoding, entities):
2254 sgmllib.SGMLParser.__init__(self)
2255 _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
2256 _BaseHTMLProcessor.__init__(self, encoding, 'application/xhtml+xml')
2257 self.entities=entities
2258
2259 def decodeEntities(self, element, data):
2260 data = data.replace('&#60;', '&lt;')
2261 data = data.replace('&#x3c;', '&lt;')
2262 data = data.replace('&#x3C;', '&lt;')
2263 data = data.replace('&#62;', '&gt;')
2264 data = data.replace('&#x3e;', '&gt;')
2265 data = data.replace('&#x3E;', '&gt;')
2266 data = data.replace('&#38;', '&amp;')
2267 data = data.replace('&#x26;', '&amp;')
2268 data = data.replace('&#34;', '&quot;')
2269 data = data.replace('&#x22;', '&quot;')
2270 data = data.replace('&#39;', '&apos;')
2271 data = data.replace('&#x27;', '&apos;')
2272 if not self.contentparams.get('type', u'xml').endswith(u'xml'):
2273 data = data.replace('&lt;', '<')
2274 data = data.replace('&gt;', '>')
2275 data = data.replace('&amp;', '&')
2276 data = data.replace('&quot;', '"')
2277 data = data.replace('&apos;', "'")
2278 data = data.replace('&#x2f;', '/')
2279 data = data.replace('&#x2F;', '/')
2280 return data
2281
2282 def strattrs(self, attrs):
2283 return ''.join([' %s="%s"' % (n,v.replace('"','&quot;')) for n,v in attrs])
2284
2285 class _RelativeURIResolver(_BaseHTMLProcessor):
2286 relative_uris = set([('a', 'href'),
2287 ('applet', 'codebase'),
2288 ('area', 'href'),
2289 ('audio', 'src'),
2290 ('blockquote', 'cite'),
2291 ('body', 'background'),
2292 ('del', 'cite'),
2293 ('form', 'action'),
2294 ('frame', 'longdesc'),
2295 ('frame', 'src'),
2296 ('iframe', 'longdesc'),
2297 ('iframe', 'src'),
2298 ('head', 'profile'),
2299 ('img', 'longdesc'),
2300 ('img', 'src'),
2301 ('img', 'usemap'),
2302 ('input', 'src'),
2303 ('input', 'usemap'),
2304 ('ins', 'cite'),
2305 ('link', 'href'),
2306 ('object', 'classid'),
2307 ('object', 'codebase'),
2308 ('object', 'data'),
2309 ('object', 'usemap'),
2310 ('q', 'cite'),
2311 ('script', 'src'),
2312 ('source', 'src'),
2313 ('video', 'poster'),
2314 ('video', 'src')])
2315
2316 def __init__(self, baseuri, encoding, _type):
2317 _BaseHTMLProcessor.__init__(self, encoding, _type)
2318 self.baseuri = baseuri
2319
2320 def resolveURI(self, uri):
2321 return _makeSafeAbsoluteURI(self.baseuri, uri.strip())
2322
2323 def unknown_starttag(self, tag, attrs):
2324 attrs = self.normalize_attrs(attrs)
2325 attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
2326 _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
2327
2328 def _resolveRelativeURIs(htmlSource, baseURI, encoding, _type):
2329 if not _SGML_AVAILABLE:
2330 return htmlSource
2331
2332 p = _RelativeURIResolver(baseURI, encoding, _type)
2333 p.feed(htmlSource)
2334 return p.output()
2335
2336 def _makeSafeAbsoluteURI(base, rel=None):
2337 # bail if ACCEPTABLE_URI_SCHEMES is empty
2338 if not ACCEPTABLE_URI_SCHEMES:
2339 return _urljoin(base, rel or u'')
2340 if not base:
2341 return rel or u''
2342 if not rel:
2343 try:
2344 scheme = urlparse.urlparse(base)[0]
2345 except ValueError:
2346 return u''
2347 if not scheme or scheme in ACCEPTABLE_URI_SCHEMES:
2348 return base
2349 return u''
2350 uri = _urljoin(base, rel)
2351 if uri.strip().split(':', 1)[0] not in ACCEPTABLE_URI_SCHEMES:
2352 return u''
2353 return uri
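# Illustrative behaviour, assuming the default ACCEPTABLE_URI_SCHEMES:
#   _makeSafeAbsoluteURI('http://a/', 'img.png')        -> 'http://a/img.png'
#   _makeSafeAbsoluteURI('http://a/', 'javascript:x()') -> u'' (scheme rejected)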
2354
2355 class _HTMLSanitizer(_BaseHTMLProcessor):
2356 acceptable_elements = set(['a', 'abbr', 'acronym', 'address', 'area',
2357 'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
2358 'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
2359 'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
2360 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
2361 'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
2362 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
2363 'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
2364 'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
2365 'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
2366 'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
2367 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
2368 'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video', 'noscript'])
2369
2370 acceptable_attributes = set(['abbr', 'accept', 'accept-charset', 'accesskey',
2371 'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
2372 'background', 'balance', 'bgcolor', 'bgproperties', 'border',
2373 'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
2374 'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
2375 'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', 'cols',
2376 'colspan', 'compact', 'contenteditable', 'controls', 'coords', 'data',
2377 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
2378 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', 'face', 'for',
2379 'form', 'frame', 'galleryimg', 'gutter', 'headers', 'height', 'hidefocus',
2380 'hidden', 'high', 'href', 'hreflang', 'hspace', 'icon', 'id', 'inputmode',
2381 'ismap', 'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
2382 'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
2383 'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
2384 'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
2385 'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
2386 'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
2387 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
2388 'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
2389 'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
2390 'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
2391 'width', 'wrap', 'xml:lang'])
2392
2393 unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
2394
2395 acceptable_css_properties = set(['azimuth', 'background-color',
2396 'border-bottom-color', 'border-collapse', 'border-color',
2397 'border-left-color', 'border-right-color', 'border-top-color', 'clear',
2398 'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
2399 'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
2400 'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
2401 'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
2402 'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
2403 'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
2404 'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
2405 'white-space', 'width'])
2406
2407 # survey of common keywords found in feeds
2408 acceptable_css_keywords = set(['auto', 'aqua', 'black', 'block', 'blue',
2409 'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
2410 'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
2411 'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
2412 'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
2413 'transparent', 'underline', 'white', 'yellow'])
2414
2415 valid_css_values = re.compile('^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|' +
2416 '\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$')
2417
2418 mathml_elements = set([
2419 'annotation',
2420 'annotation-xml',
2421 'maction',
2422 'maligngroup',
2423 'malignmark',
2424 'math',
2425 'menclose',
2426 'merror',
2427 'mfenced',
2428 'mfrac',
2429 'mglyph',
2430 'mi',
2431 'mlabeledtr',
2432 'mlongdiv',
2433 'mmultiscripts',
2434 'mn',
2435 'mo',
2436 'mover',
2437 'mpadded',
2438 'mphantom',
2439 'mprescripts',
2440 'mroot',
2441 'mrow',
2442 'ms',
2443 'mscarries',
2444 'mscarry',
2445 'msgroup',
2446 'msline',
2447 'mspace',
2448 'msqrt',
2449 'msrow',
2450 'mstack',
2451 'mstyle',
2452 'msub',
2453 'msubsup',
2454 'msup',
2455 'mtable',
2456 'mtd',
2457 'mtext',
2458 'mtr',
2459 'munder',
2460 'munderover',
2461 'none',
2462 'semantics',
2463 ])
2464
2465 mathml_attributes = set([
2466 'accent',
2467 'accentunder',
2468 'actiontype',
2469 'align',
2470 'alignmentscope',
2471 'altimg',
2472 'altimg-height',
2473 'altimg-valign',
2474 'altimg-width',
2475 'alttext',
2476 'bevelled',
2477 'charalign',
2478 'close',
2479 'columnalign',
2480 'columnlines',
2481 'columnspacing',
2482 'columnspan',
2483 'columnwidth',
2484 'crossout',
2485 'decimalpoint',
2486 'denomalign',
2487 'depth',
2488 'dir',
2489 'display',
2490 'displaystyle',
2491 'edge',
2492 'encoding',
2493 'equalcolumns',
2494 'equalrows',
2495 'fence',
2496 'fontstyle',
2497 'fontweight',
2498 'form',
2499 'frame',
2500 'framespacing',
2501 'groupalign',
2502 'height',
2503 'href',
2504 'id',
2505 'indentalign',
2506 'indentalignfirst',
2507 'indentalignlast',
2508 'indentshift',
2509 'indentshiftfirst',
2510 'indentshiftlast',
2511 'indenttarget',
2512 'infixlinebreakstyle',
2513 'largeop',
2514 'length',
2515 'linebreak',
2516 'linebreakmultchar',
2517 'linebreakstyle',
2518 'lineleading',
2519 'linethickness',
2520 'location',
2521 'longdivstyle',
2522 'lquote',
2523 'lspace',
2524 'mathbackground',
2525 'mathcolor',
2526 'mathsize',
2527 'mathvariant',
2528 'maxsize',
2529 'minlabelspacing',
2530 'minsize',
2531 'movablelimits',
2532 'notation',
2533 'numalign',
2534 'open',
2535 'other',
2536 'overflow',
2537 'position',
2538 'rowalign',
2539 'rowlines',
2540 'rowspacing',
2541 'rowspan',
2542 'rquote',
2543 'rspace',
2544 'scriptlevel',
2545 'scriptminsize',
2546 'scriptsizemultiplier',
2547 'selection',
2548 'separator',
2549 'separators',
2550 'shift',
2551 'side',
2552 'src',
2553 'stackalign',
2554 'stretchy',
2555 'subscriptshift',
2556 'superscriptshift',
2557 'symmetric',
2558 'voffset',
2559 'width',
2560 'xlink:href',
2561 'xlink:show',
2562 'xlink:type',
2563 'xmlns',
2564 'xmlns:xlink',
2565 ])
2566
2567 # svgtiny - foreignObject + linearGradient + radialGradient + stop
2568 svg_elements = set(['a', 'animate', 'animateColor', 'animateMotion',
2569 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'foreignObject',
2570 'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
2571 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'mpath',
2572 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'set', 'stop',
2573 'svg', 'switch', 'text', 'title', 'tspan', 'use'])
2574
2575 # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
2576 svg_attributes = set(['accent-height', 'accumulate', 'additive', 'alphabetic',
2577 'arabic-form', 'ascent', 'attributeName', 'attributeType',
2578 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
2579 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
2580 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
2581 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
2582 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
2583 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
2584 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'keySplines',
2585 'keyTimes', 'lang', 'mathematical', 'marker-end', 'marker-mid',
2586 'marker-start', 'markerHeight', 'markerUnits', 'markerWidth', 'max',
2587 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
2588 'overline-position', 'overline-thickness', 'panose-1', 'path',
2589 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX', 'refY',
2590 'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
2591 'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
2592 'stop-color', 'stop-opacity', 'strikethrough-position',
2593 'strikethrough-thickness', 'stroke', 'stroke-dasharray',
2594 'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
2595 'stroke-miterlimit', 'stroke-opacity', 'stroke-width', 'systemLanguage',
2596 'target', 'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
2597 'underline-position', 'underline-thickness', 'unicode', 'unicode-range',
2598 'units-per-em', 'values', 'version', 'viewBox', 'visibility', 'width',
2599 'widths', 'x', 'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
2600 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
2601 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1',
2602 'y2', 'zoomAndPan'])
2603
2604 svg_attr_map = None
2605 svg_elem_map = None
2606
2607 acceptable_svg_properties = set([ 'fill', 'fill-opacity', 'fill-rule',
2608 'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
2609 'stroke-opacity'])
2610
2611 def reset(self):
2612 _BaseHTMLProcessor.reset(self)
2613 self.unacceptablestack = 0
2614 self.mathmlOK = 0
2615 self.svgOK = 0
2616
2617 def unknown_starttag(self, tag, attrs):
2618 acceptable_attributes = self.acceptable_attributes
2619 keymap = {}
2620 if not tag in self.acceptable_elements or self.svgOK:
2621 if tag in self.unacceptable_elements_with_end_tag:
2622 self.unacceptablestack += 1
2623
2624 # add implicit namespaces to html5 inline svg/mathml
2625 if self._type.endswith('html'):
2626 if not dict(attrs).get('xmlns'):
2627 if tag=='svg':
2628 attrs.append( ('xmlns','http://www.w3.org/2000/svg') )
2629 if tag=='math':
2630 attrs.append( ('xmlns','http://www.w3.org/1998/Math/MathML') )
2631
2632 # not otherwise acceptable, perhaps it is MathML or SVG?
2633 if tag=='math' and ('xmlns','http://www.w3.org/1998/Math/MathML') in attrs:
2634 self.mathmlOK += 1
2635 if tag=='svg' and ('xmlns','http://www.w3.org/2000/svg') in attrs:
2636 self.svgOK += 1
2637
2638 # choose acceptable attributes based on tag class, else bail
2639 if self.mathmlOK and tag in self.mathml_elements:
2640 acceptable_attributes = self.mathml_attributes
2641 elif self.svgOK and tag in self.svg_elements:
2642 # for most vocabularies, lowercasing is a good idea. Many
2643 # svg elements, however, are camel case
2644 if not self.svg_attr_map:
2645 lower=[attr.lower() for attr in self.svg_attributes]
2646 mix=[a for a in self.svg_attributes if a not in lower]
2647 self.svg_attributes = lower
2648 self.svg_attr_map = dict([(a.lower(),a) for a in mix])
2649
2650 lower=[attr.lower() for attr in self.svg_elements]
2651 mix=[a for a in self.svg_elements if a not in lower]
2652 self.svg_elements = lower
2653 self.svg_elem_map = dict([(a.lower(),a) for a in mix])
2654 acceptable_attributes = self.svg_attributes
2655 tag = self.svg_elem_map.get(tag,tag)
2656 keymap = self.svg_attr_map
2657 elif not tag in self.acceptable_elements:
2658 return
2659
2660 # declare xlink namespace, if needed
2661 if self.mathmlOK or self.svgOK:
2662 if filter(lambda (n,v): n.startswith('xlink:'),attrs):
2663 if not ('xmlns:xlink','http://www.w3.org/1999/xlink') in attrs:
2664 attrs.append(('xmlns:xlink','http://www.w3.org/1999/xlink'))
2665
2666 clean_attrs = []
2667 for key, value in self.normalize_attrs(attrs):
2668 if key in acceptable_attributes:
2669 key=keymap.get(key,key)
2670 # make sure the uri uses an acceptable uri scheme
2671 if key == u'href':
2672 value = _makeSafeAbsoluteURI(value)
2673 clean_attrs.append((key,value))
2674 elif key=='style':
2675 clean_value = self.sanitize_style(value)
2676 if clean_value:
2677 clean_attrs.append((key,clean_value))
2678 _BaseHTMLProcessor.unknown_starttag(self, tag, clean_attrs)
2679
2680 def unknown_endtag(self, tag):
2681 if not tag in self.acceptable_elements:
2682 if tag in self.unacceptable_elements_with_end_tag:
2683 self.unacceptablestack -= 1
2684 if self.mathmlOK and tag in self.mathml_elements:
2685 if tag == 'math' and self.mathmlOK:
2686 self.mathmlOK -= 1
2687 elif self.svgOK and tag in self.svg_elements:
2688 tag = self.svg_elem_map.get(tag,tag)
2689 if tag == 'svg' and self.svgOK:
2690 self.svgOK -= 1
2691 else:
2692 return
2693 _BaseHTMLProcessor.unknown_endtag(self, tag)
2694
2695 def handle_pi(self, text):
2696 pass
2697
2698 def handle_decl(self, text):
2699 pass
2700
2701 def handle_data(self, text):
2702 if not self.unacceptablestack:
2703 _BaseHTMLProcessor.handle_data(self, text)
2704
2705 def sanitize_style(self, style):
2706 # disallow urls
2707 style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
2708
2709 # gauntlet
2710 if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
2711 return ''
2712 # This replaced a regexp that used re.match and was prone to pathological back-tracking.
2713 if re.sub("\s*[-\w]+\s*:\s*[^:;]*;?", '', style).strip():
2714 return ''
2715
2716 clean = []
2717 for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
2718 if not value:
2719 continue
2720 if prop.lower() in self.acceptable_css_properties:
2721 clean.append(prop + ': ' + value + ';')
2722 elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
2723 for keyword in value.split():
2724 if not keyword in self.acceptable_css_keywords and \
2725 not self.valid_css_values.match(keyword):
2726 break
2727 else:
2728 clean.append(prop + ': ' + value + ';')
2729 elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
2730 clean.append(prop + ': ' + value + ';')
2731
2732 return ' '.join(clean)
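# Illustrative behaviour (hypothetical input): for the style string
#   'color: red; background: url(http://evil/x)'
# the url() expression is blanked out first, both gauntlet checks then pass,
# and only whitelisted properties survive, yielding 'color: red;'.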
2733
2734 def parse_comment(self, i, report=1):
2735 ret = _BaseHTMLProcessor.parse_comment(self, i, report)
2736 if ret >= 0:
2737 return ret
2738 # if ret == -1, this may be a malicious attempt to circumvent
2739 # sanitization, or a page-destroying unclosed comment
2740 match = re.compile(r'--[^>]*>').search(self.rawdata, i+4)
2741 if match:
2742 return match.end()
2743 # unclosed comment; deliberately fail to handle_data()
2744 return len(self.rawdata)
2745
2746
2747 def _sanitizeHTML(htmlSource, encoding, _type):
2748 if not _SGML_AVAILABLE:
2749 return htmlSource
2750 p = _HTMLSanitizer(encoding, _type)
2751 htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
2752 p.feed(htmlSource)
2753 data = p.output()
2754 data = data.strip().replace('\r\n', '\n')
2755 return data
2756
2757 class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
2758 def http_error_default(self, req, fp, code, msg, headers):
2759 # The default implementation just raises HTTPError.
2760 # Forget that.
2761 fp.status = code
2762 return fp
2763
2764 def http_error_301(self, req, fp, code, msg, hdrs):
2765 result = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp,
2766 code, msg, hdrs)
2767 result.status = code
2768 result.newurl = result.geturl()
2769 return result
2770 # The default implementations in urllib2.HTTPRedirectHandler
2771 # are identical, so hardcoding a http_error_301 call above
2772 # won't affect anything
2773 http_error_300 = http_error_301
2774 http_error_302 = http_error_301
2775 http_error_303 = http_error_301
2776 http_error_307 = http_error_301
2777
2778 def http_error_401(self, req, fp, code, msg, headers):
2779 # Check if
2780 # - server requires digest auth, AND
2781 # - we tried (unsuccessfully) with basic auth.
2782 # If both conditions hold, parse authentication information
2783 # out of the Authorization header we sent the first time
2784 # (for the username and password) and the WWW-Authenticate
2785 # header the server sent back (for the realm) and retry
2786 # the request with the appropriate digest auth headers instead.
2787 # This evil genius hack has been brought to you by Aaron Swartz.
2788 host = urlparse.urlparse(req.get_full_url())[1]
2789 if base64 is None or 'Authorization' not in req.headers \
2790 or 'WWW-Authenticate' not in headers:
2791 return self.http_error_default(req, fp, code, msg, headers)
2792 auth = _base64decode(req.headers['Authorization'].split(' ')[1])
2793 user, passw = auth.split(':')
2794 realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
2795 self.add_password(realm, host, user, passw)
2796 retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
2797 self.reset_retry_count()
2798 return retry
2799
2800 def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers):
2801 """URL, filename, or string --> stream
2802
2803 This function lets you define parsers that take any input source
2804 (URL, pathname to local or network file, or actual data as a string)
2805 and deal with it in a uniform manner. Returned object is guaranteed
2806 to have all the basic stdio read methods (read, readline, readlines).
2807 Just .close() the object when you're done with it.
2808
2809 If the etag argument is supplied, it will be used as the value of an
2810 If-None-Match request header.
2811
2812 If the modified argument is supplied, it can be a tuple of 9 integers
2813 (as returned by gmtime() in the standard Python time module) or a date
2814 string in any format supported by feedparser. Regardless, it MUST
2815 be in GMT (Greenwich Mean Time). It will be reformatted into an
2816 RFC 1123-compliant date and used as the value of an If-Modified-Since
2817 request header.
2818
2819 If the agent argument is supplied, it will be used as the value of a
2820 User-Agent request header.
2821
2822 If the referrer argument is supplied, it will be used as the value of a
2823 Referer[sic] request header.
2824
2825 If handlers is supplied, it is a list of handlers used to build a
2826 urllib2 opener.
2827
2828 If request_headers is supplied, it is a dictionary of HTTP request headers
2829 that will override the values generated by FeedParser.
2830
2831 :return: A :class:`StringIO.StringIO` or :class:`io.BytesIO`.
2832 """
2833
2834 if hasattr(url_file_stream_or_string, 'read'):
2835 return url_file_stream_or_string
2836
2837 if isinstance(url_file_stream_or_string, basestring) \
2838 and urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp', 'file', 'feed'):
2839 # Deal with the feed URI scheme
2840 if url_file_stream_or_string.startswith('feed:http'):
2841 url_file_stream_or_string = url_file_stream_or_string[5:]
2842 elif url_file_stream_or_string.startswith('feed:'):
2843 url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
2844 if not agent:
2845 agent = USER_AGENT
2846 # Test for inline user:password credentials for HTTP basic auth
2847 auth = None
2848 if base64 and not url_file_stream_or_string.startswith('ftp:'):
2849 urltype, rest = urllib.splittype(url_file_stream_or_string)
2850 realhost, rest = urllib.splithost(rest)
2851 if realhost:
2852 user_passwd, realhost = urllib.splituser(realhost)
2853 if user_passwd:
2854 url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
2855 auth = base64.standard_b64encode(user_passwd).strip()
2856
2857 # iri support
2858 if isinstance(url_file_stream_or_string, unicode):
2859 url_file_stream_or_string = _convert_to_idn(url_file_stream_or_string)
2860
2861 # try to open with urllib2 (to use optional headers)
2862 request = _build_urllib2_request(url_file_stream_or_string, agent, etag, modified, referrer, auth, request_headers)
2863 opener = urllib2.build_opener(*tuple(handlers + [_FeedURLHandler()]))
2864 opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
2865 try:
2866 return opener.open(request)
2867 finally:
2868 opener.close() # JohnD
2869
2870 # try to open with native open function (if url_file_stream_or_string is a filename)
2871 try:
2872 return open(url_file_stream_or_string, 'rb')
2873 except (IOError, UnicodeEncodeError, TypeError):
2874 # if url_file_stream_or_string is a unicode object that
2875 # cannot be converted to the encoding returned by
2876 # sys.getfilesystemencoding(), a UnicodeEncodeError
2877 # will be thrown
2878 # If url_file_stream_or_string is a string that contains NULL
2879 # (such as an XML document encoded in UTF-32), TypeError will
2880 # be thrown.
2881 pass
2882
2883 # treat url_file_stream_or_string as string
2884 if isinstance(url_file_stream_or_string, unicode):
2885 return _StringIO(url_file_stream_or_string.encode('utf-8'))
2886 return _StringIO(url_file_stream_or_string)
2887
2888 def _convert_to_idn(url):
2889 """Convert a URL to IDN notation"""
2890 # this function should only be called with a unicode string
2891 # strategy: if the host cannot be encoded in ascii, then
2892 # it'll be necessary to encode it in idn form
2893 parts = list(urlparse.urlsplit(url))
2894 try:
2895 parts[1].encode('ascii')
2896 except UnicodeEncodeError:
2897 # the url needs to be converted to idn notation
2898 host = parts[1].rsplit(':', 1)
2899 newhost = []
2900 port = u''
2901 if len(host) == 2:
2902 port = host.pop()
2903 for h in host[0].split('.'):
2904 newhost.append(h.encode('idna').decode('utf-8'))
2905 parts[1] = '.'.join(newhost)
2906 if port:
2907 parts[1] += ':' + port
2908 return urlparse.urlunsplit(parts)
2909 else:
2910 return url
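# For example, a host with non-ASCII labels is punycoded label by label:
#   _convert_to_idn(u'http://münchen.example/feed')
#   -> u'http://xn--mnchen-3ya.example/feed'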
2911
2912 def _build_urllib2_request(url, agent, etag, modified, referrer, auth, request_headers):
2913 request = urllib2.Request(url)
2914 request.add_header('User-Agent', agent)
2915 if etag:
2916 request.add_header('If-None-Match', etag)
2917 if isinstance(modified, basestring):
2918 modified = _parse_date(modified)
2919 elif isinstance(modified, datetime.datetime):
2920 modified = modified.utctimetuple()
2921 if modified:
2922 # format into an RFC 1123-compliant timestamp. We can't use
2923 # time.strftime() since the %a and %b directives can be affected
2924 # by the current locale, but RFC 2616 states that dates must be
2925 # in English.
2926 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
2927 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
2928 request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
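# e.g. a struct_time for 2004-01-05 12:30:00 GMT formats as
# 'Mon, 05 Jan 2004 12:30:00 GMT', independent of the current locale.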
2929 if referrer:
2930 request.add_header('Referer', referrer)
2931 if gzip and zlib:
2932 request.add_header('Accept-encoding', 'gzip, deflate')
2933 elif gzip:
2934 request.add_header('Accept-encoding', 'gzip')
2935 elif zlib:
2936 request.add_header('Accept-encoding', 'deflate')
2937 else:
2938 request.add_header('Accept-encoding', '')
2939 if auth:
2940 request.add_header('Authorization', 'Basic %s' % auth)
2941 if ACCEPT_HEADER:
2942 request.add_header('Accept', ACCEPT_HEADER)
2943 # use this for whatever -- cookies, special headers, etc
2944 # [('Cookie','Something'),('x-special-header','Another Value')]
2945 for header_name, header_value in request_headers.items():
2946 request.add_header(header_name, header_value)
2947 request.add_header('A-IM', 'feed') # RFC 3229 support
2948 return request
2949
2950 def _parse_psc_chapter_start(start):
2951 FORMAT = r'^((\d{2}):)?(\d{2}):(\d{2})(\.(\d{3}))?$'
2952
2953 m = re.compile(FORMAT).match(start)
2954 if m is None:
2955 return None
2956
2957 _, h, m, s, _, ms = m.groups()
2958 h, m, s, ms = (int(h or 0), int(m), int(s), int(ms or 0))
2959 return datetime.timedelta(0, h*60*60 + m*60 + s, ms*1000)
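# Illustrative results:
#   _parse_psc_chapter_start('01:02:03.500') -> datetime.timedelta(0, 3723, 500000)
#   _parse_psc_chapter_start('not-a-time')   -> None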
2960
2961 _date_handlers = []
2962 def registerDateHandler(func):
2963 '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
2964 _date_handlers.insert(0, func)
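# A sketch of plugging in a custom handler (hypothetical name):
#   def _parse_date_custom(dateString):
#       return None  # or a 9-tuple date in GMT
#   registerDateHandler(_parse_date_custom)
# Handlers are consulted most-recently-registered first (insert at index 0).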
2965
2966 # ISO-8601 date parsing routines written by Fazal Majid.
2967 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2968 # parser is beyond the scope of feedparser and would be a worthwhile addition
2969 # to the Python library.
2970 # A single regular expression cannot parse ISO 8601 date formats into groups
2971 # as the standard is highly irregular (for instance, is 030104 2003-01-04 or
2972 # 0301-04-01?), so we use templates instead.
2973 # Please note the order in templates is significant because we need a
2974 # greedy match.
2975 _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-0MM?-?DD', 'YYYY-MM', 'YYYY-?OOO',
2976 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2977 '-YY-?MM', '-OOO', '-YY',
2978 '--MM-?DD', '--MM',
2979 '---DD',
2980 'CC', '']
2981 _iso8601_re = [
2982 tmpl.replace(
2983 'YYYY', r'(?P<year>\d{4})').replace(
2984 'YY', r'(?P<year>\d\d)').replace(
2985 'MM', r'(?P<month>[01]\d)').replace(
2986 'DD', r'(?P<day>[0123]\d)').replace(
2987 'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2988 'CC', r'(?P<century>\d\d$)')
2989 + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2990 + r'(:(?P<second>\d{2}))?'
2991 + r'(\.(?P<fracsecond>\d+))?'
2992 + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2993 for tmpl in _iso8601_tmpl]
2994 try:
2995 del tmpl
2996 except NameError:
2997 pass
2998 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2999 try:
3000 del regex
3001 except NameError:
3002 pass
3003
3004 def _parse_date_iso8601(dateString):
3005 '''Parse a variety of ISO-8601-compatible formats like 20040105'''
3006 m = None
3007 for _iso8601_match in _iso8601_matches:
3008 m = _iso8601_match(dateString)
3009 if m:
3010 break
3011 if not m:
3012 return
3013 if m.span() == (0, 0):
3014 return
3015 params = m.groupdict()
3016 ordinal = params.get('ordinal', 0)
3017 if ordinal:
3018 ordinal = int(ordinal)
3019 else:
3020 ordinal = 0
3021 year = params.get('year', '--')
3022 if not year or year == '--':
3023 year = time.gmtime()[0]
3024 elif len(year) == 2:
3025 # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
3026 year = 100 * int(time.gmtime()[0] / 100) + int(year)
3027 else:
3028 year = int(year)
3029 month = params.get('month', '-')
3030 if not month or month == '-':
3031 # ordinals are NOT normalized by mktime, we simulate them
3032 # by setting month=1, day=ordinal
3033 if ordinal:
3034 month = 1
3035 else:
3036 month = time.gmtime()[1]
3037 month = int(month)
3038 day = params.get('day', 0)
3039 if not day:
3040 # see above
3041 if ordinal:
3042 day = ordinal
3043 elif params.get('century', 0) or \
3044 params.get('year', 0) or params.get('month', 0):
3045 day = 1
3046 else:
3047 day = time.gmtime()[2]
3048 else:
3049 day = int(day)
3050 # special case of the century - is the first year of the 21st century
3051 # 2000 or 2001? The debate goes on...
3052 if 'century' in params:
3053 year = (int(params['century']) - 1) * 100 + 1
3054 # in ISO 8601 most fields are optional
3055 for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
3056 if not params.get(field, None):
3057 params[field] = 0
3058 hour = int(params.get('hour', 0))
3059 minute = int(params.get('minute', 0))
3060 second = int(float(params.get('second', 0)))
3061 # weekday is normalized by mktime(), we can ignore it
3062 weekday = 0
3063 daylight_savings_flag = -1
3064 tm = [year, month, day, hour, minute, second, weekday,
3065 ordinal, daylight_savings_flag]
3066 # ISO 8601 time zone adjustments
3067 tz = params.get('tz')
3068 if tz and tz != 'Z':
3069 if tz[0] == '-':
3070 tm[3] += int(params.get('tzhour', 0))
3071 tm[4] += int(params.get('tzmin', 0))
3072 elif tz[0] == '+':
3073 tm[3] -= int(params.get('tzhour', 0))
3074 tm[4] -= int(params.get('tzmin', 0))
3075 else:
3076 return None
3077 # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
3078 # which is guaranteed to normalize d/m/y/h/m/s.
3079 # Many implementations have bugs, but we'll pretend they don't.
3080 return time.localtime(time.mktime(tuple(tm)))
3081 registerDateHandler(_parse_date_iso8601)
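# e.g. _parse_date_iso8601('20040105') and _parse_date_iso8601('2004-01-05')
# both yield a struct_time for January 5, 2004; results pass through
# time.mktime()/time.localtime(), so they are expressed in local time.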
3082
3083 # 8-bit date handling routines written by ytrewq1.
3084 _korean_year = u'\ub144' # b3e2 in euc-kr
3085 _korean_month = u'\uc6d4' # bff9 in euc-kr
3086 _korean_day = u'\uc77c' # c0cf in euc-kr
3087 _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
3088 _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
3089
3090 _korean_onblog_date_re = \
3091 re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
3092 (_korean_year, _korean_month, _korean_day))
3093 _korean_nate_date_re = \
3094 re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
3095 (_korean_am, _korean_pm))
3096 def _parse_date_onblog(dateString):
3097 '''Parse a string according to the OnBlog 8-bit date format'''
3098 m = _korean_onblog_date_re.match(dateString)
3099 if not m:
3100 return
3101 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3102 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3103 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
3104 'zonediff': '+09:00'}
3105 return _parse_date_w3dtf(w3dtfdate)
3106 registerDateHandler(_parse_date_onblog)
3107
3108 def _parse_date_nate(dateString):
3109 '''Parse a string according to the Nate 8-bit date format'''
3110 m = _korean_nate_date_re.match(dateString)
3111 if not m:
3112 return
3113 hour = int(m.group(5))
3114 ampm = m.group(4)
3115 if (ampm == _korean_pm):
3116 hour += 12
3117 hour = str(hour)
3118 if len(hour) == 1:
3119 hour = '0' + hour
3120 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
3121 {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
3122 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
3123 'zonediff': '+09:00'}
3124 return _parse_date_w3dtf(w3dtfdate)
3125 registerDateHandler(_parse_date_nate)
3126
3127 # Unicode strings for Greek date strings
3128 _greek_months = \
3129 { \
3130 u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
3131 u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
3132 u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
3133 u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
3134 u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
3135 u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
3136 u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
3137 u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
3138 u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
3139 u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
3140 u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
3141 u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
3142 u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
3143 u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
3144 u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
3145 u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
3146 u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
3147 u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
3148 u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
3149 }
3150
3151 _greek_wdays = \
3152 { \
3153 u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
3154 u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
3155 u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
3156 u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
3157 u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
3158 u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
3159 u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
3160 }
3161
3162 _greek_date_format_re = \
3163 re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
3164
3165 def _parse_date_greek(dateString):
3166 '''Parse a string according to a Greek 8-bit date format.'''
3167 m = _greek_date_format_re.match(dateString)
3168 if not m:
3169 return
3170 wday = _greek_wdays[m.group(1)]
3171 month = _greek_months[m.group(3)]
3172 rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
3173 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
3174 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
3175 'zonediff': m.group(8)}
3176 return _parse_date_rfc822(rfc822date)
3177 registerDateHandler(_parse_date_greek)
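# Illustrative sketch: the Greek weekday and month tokens (escapes for
# 'Sun' and 'Jan' per the tables above) are rewritten into an RFC 822
# date, so the timezone offset is honored by _parse_date_rfc822:
#
#     >>> _parse_date_greek(u'\u039a\u03c5\u03c1, 11 \u0399\u03b1\u03bd 2004 12:00:00 +0200')[:6]
#     (2004, 1, 11, 10, 0, 0)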
3178
3179 # Unicode strings for Hungarian date strings
3180 _hungarian_months = \
3181 { \
3182 u'janu\u00e1r': u'01', # e1 in iso-8859-2
3183 u'febru\u00e1r': u'02', # e1 in iso-8859-2
3184 u'm\u00e1rcius': u'03', # e1 in iso-8859-2
3185 u'\u00e1prilis': u'04', # e1 in iso-8859-2
3186 u'm\u00e1jus': u'05', # e1 in iso-8859-2
3187 u'j\u00fanius': u'06', # fa in iso-8859-2
3188 u'j\u00falius': u'07', # fa in iso-8859-2
3189 u'augusztus': u'08',
3190 u'szeptember': u'09',
3191 u'okt\u00f3ber': u'10', # f3 in iso-8859-2
3192 u'november': u'11',
3193 u'december': u'12',
3194 }
3195
3196 _hungarian_date_format_re = \
3197 re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
3198
3199 def _parse_date_hungarian(dateString):
3200 '''Parse a string according to a Hungarian 8-bit date format.'''
3201 m = _hungarian_date_format_re.match(dateString)
3202 if not m or m.group(2) not in _hungarian_months:
3203 return None
3204 month = _hungarian_months[m.group(2)]
3205 day = m.group(3)
3206 if len(day) == 1:
3207 day = '0' + day
3208 hour = m.group(4)
3209 if len(hour) == 1:
3210 hour = '0' + hour
3211 w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
3212 {'year': m.group(1), 'month': month, 'day': day,\
3213 'hour': hour, 'minute': m.group(5),\
3214 'zonediff': m.group(6)}
3215 return _parse_date_w3dtf(w3dtfdate)
3216 registerDateHandler(_parse_date_hungarian)
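# Illustrative sketch: the month name is looked up in the table above,
# single-digit day and hour fields are zero-padded, and the rebuilt W3DTF
# string (which carries no seconds field) is parsed with its zone offset:
#
#     >>> _parse_date_hungarian(u'2004-november-3T9:27+05:00')[:5]
#     (2004, 11, 3, 4, 27)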
3217
3218 timezonenames = {
3219 'ut': 0, 'gmt': 0, 'z': 0,
3220 'adt': -3, 'ast': -4, 'at': -4,
3221 'edt': -4, 'est': -5, 'et': -5,
3222 'cdt': -5, 'cst': -6, 'ct': -6,
3223 'mdt': -6, 'mst': -7, 'mt': -7,
3224 'pdt': -7, 'pst': -8, 'pt': -8,
3225 'a': -1, 'n': 1, # single-letter military zones, signs as defined in RFC 822
3226 'm': -12, 'y': 12,
3227 }
3228 # W3 date and time format parser
3229 # http://www.w3.org/TR/NOTE-datetime
3230 # Also supports MSSQL-style datetimes as defined at:
3231 # http://msdn.microsoft.com/en-us/library/ms186724.aspx
3232 # (basically, allow a space as a date/time/timezone separator)
3233 def _parse_date_w3dtf(datestr):
3234 if not datestr.strip():
3235 return None
3236 parts = datestr.lower().split('t')
3237 if len(parts) == 1:
3238 # This may be a date only, or may be an MSSQL-style date
3239 parts = parts[0].split()
3240 if len(parts) == 1:
3241 # Treat this as a date only
3242 parts.append('00:00:00z')
3243 elif len(parts) > 2:
3244 return None
3245 date = parts[0].split('-', 2)
3246 if not date or len(date[0]) != 4:
3247 return None
3248 # Ensure that `date` has 3 elements. Using '1' sets the default
3249 # month to January and the default day to the 1st of the month.
3250 date.extend(['1'] * (3 - len(date)))
3251 try:
3252 year, month, day = [int(i) for i in date]
3253 except ValueError:
3254 # `date` may have more than 3 elements or may contain
3255 # non-integer strings.
3256 return None
3257 if parts[1].endswith('z'):
3258 parts[1] = parts[1][:-1]
3259 parts.append('z')
3260 # Append the numeric timezone offset, if any, to parts.
3261 # If this is an MSSQL-style date then parts[2] already contains
3262 # the timezone information, so `append()` will not affect it.
3263 # Add 1 to each value so that if `find()` returns -1 it will be
3264 # treated as False.
3265 loc = parts[1].find('-') + 1 or parts[1].find('+') + 1 or len(parts[1]) + 1
3266 loc = loc - 1
3267 parts.append(parts[1][loc:])
3268 parts[1] = parts[1][:loc]
3269 time = parts[1].split(':', 2)
3270 # Ensure that time has 3 elements. Using '0' means that the
3271 # minutes and seconds, if missing, will default to 0.
3272 time.extend(['0'] * (3 - len(time)))
3273 tzhour = 0
3274 tzmin = 0
3275 if parts[2][:1] in ('-', '+'):
3276 try:
3277 tzhour = int(parts[2][1:3])
3278 tzmin = int(parts[2][4:])
3279 except ValueError:
3280 return None
3281 if parts[2].startswith('-'):
3282 tzhour = tzhour * -1
3283 tzmin = tzmin * -1
3284 else:
3285 tzhour = timezonenames.get(parts[2], 0)
3286 try:
3287 hour, minute, second = [int(float(i)) for i in time]
3288 except ValueError:
3289 return None
3290 # Create the datetime object and timezone delta objects
3291 try:
3292 stamp = datetime.datetime(year, month, day, hour, minute, second)
3293 except ValueError:
3294 return None
3295 delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3296 # Return the date and time as a UTC 9-tuple
3297 try:
3298 return (stamp - delta).utctimetuple()
3299 except (OverflowError, ValueError):
3300 # IronPython throws ValueErrors instead of OverflowErrors
3301 return None
3302
3303 registerDateHandler(_parse_date_w3dtf)
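# Illustrative sketch of the behaviors described above: a date-only string
# defaults to midnight UTC, and a full W3DTF timestamp is normalized to a
# UTC 9-tuple:
#
#     >>> _parse_date_w3dtf(u'2003-12-31T10:14:55Z')[:6]
#     (2003, 12, 31, 10, 14, 55)
#     >>> _parse_date_w3dtf(u'2003-12-31')[:6]
#     (2003, 12, 31, 0, 0, 0)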
3304
3305 def _parse_date_rfc822(date):
3306 """Parse RFC 822 dates and times
3307 http://tools.ietf.org/html/rfc822#section-5
3308
3309 There are some formatting differences that are accounted for:
3310 1. Years may be two or four digits.
3311 2. The month and day can be swapped.
3312 3. Additional timezone names are supported.
3313 4. A default time and timezone are assumed if only a date is present.
3314 """
3315 daynames = set(['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'])
3316 months = {
3317 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
3318 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
3319 }
3320
3321 parts = date.lower().split()
3322 if len(parts) < 5:
3323 # Assume that the time and timezone are missing
3324 parts.extend(('00:00:00', '0000'))
3325 # Remove the day name
3326 if parts[0][:3] in daynames:
3327 parts = parts[1:]
3328 if len(parts) < 5:
3329 # If there are still fewer than five parts, there's not enough
3330 # information to interpret this
3331 return None
3332 try:
3333 day = int(parts[0])
3334 except ValueError:
3335 # Check if the day and month are swapped
3336 if months.get(parts[0][:3]):
3337 try:
3338 day = int(parts[1])
3339 except ValueError:
3340 return None
3341 else:
3342 parts[1] = parts[0]
3343 else:
3344 return None
3345 month = months.get(parts[1][:3])
3346 if not month:
3347 return None
3348 try:
3349 year = int(parts[2])
3350 except ValueError:
3351 return None
3352 # Normalize two-digit years:
3353 # Anything in the 90's is interpreted as 1990 and on
3354 # Anything 89 or less is interpreted as 2089 or before
3355 if len(parts[2]) <= 2:
3356 year += (1900, 2000)[year < 90]
3357 timeparts = parts[3].split(':')
3358 timeparts = timeparts + ([0] * (3 - len(timeparts)))
3359 try:
3360 (hour, minute, second) = map(int, timeparts)
3361 except ValueError:
3362 return None
3363 tzhour = 0
3364 tzmin = 0
3365 # Strip 'Etc/' from the timezone
3366 if parts[4].startswith('etc/'):
3367 parts[4] = parts[4][4:]
3368 # Normalize timezones that start with 'gmt':
3369 # GMT-05:00 => -0500
3370 # GMT => GMT
3371 if parts[4].startswith('gmt'):
3372 parts[4] = ''.join(parts[4][3:].split(':')) or 'gmt'
3373 # Handle timezones like '-0500', '+0500', and 'EST'
3374 if parts[4] and parts[4][0] in ('-', '+'):
3375 try:
3376 tzhour = int(parts[4][1:3])
3377 tzmin = int(parts[4][3:])
3378 except ValueError:
3379 return None
3380 if parts[4].startswith('-'):
3381 tzhour = tzhour * -1
3382 tzmin = tzmin * -1
3383 else:
3384 tzhour = timezonenames.get(parts[4], 0)
3385 # Create the datetime object and timezone delta objects
3386 try:
3387 stamp = datetime.datetime(year, month, day, hour, minute, second)
3388 except ValueError:
3389 return None
3390 delta = datetime.timedelta(0, 0, 0, 0, tzmin, tzhour)
3391 # Return the date and time as a UTC 9-tuple
3392 try:
3393 return (stamp - delta).utctimetuple()
3394 except (OverflowError, ValueError):
3395 # IronPython throws ValueErrors instead of OverflowErrors
3396 return None
3397 registerDateHandler(_parse_date_rfc822)
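# Illustrative sketch of the docstring's extensions: two-digit years are
# widened, and a date with no time or timezone defaults to midnight UTC:
#
#     >>> _parse_date_rfc822(u'Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#     (2004, 1, 1, 19, 48, 21)
#     >>> _parse_date_rfc822(u'1 Jan 04')[:6]
#     (2004, 1, 1, 0, 0, 0)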
3398
3399 _months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
3400 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
3401 def _parse_date_asctime(dt):
3402 """Parse asctime-style dates.
3403
3404 Converts asctime to RFC822-compatible dates and uses the RFC822 parser
3405 to do the actual parsing.
3406
3407 Supported formats (format is standardized to the first one listed):
3408
3409 * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
3410 * {weekday name} {month name} dd hh:mm:ss yyyy
3411 """
3412
3413 parts = dt.split()
3414
3415 # Insert a GMT timezone, if needed.
3416 if len(parts) == 5:
3417 parts.insert(4, '+0000')
3418
3419 # Exit if there are not six parts.
3420 if len(parts) != 6:
3421 return None
3422
3423 # Reassemble the parts in an RFC822-compatible order and parse them.
3424 return _parse_date_rfc822(' '.join([
3425 parts[0], parts[2], parts[1], parts[5], parts[3], parts[4],
3426 ]))
3427 registerDateHandler(_parse_date_asctime)
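# Illustrative sketch: the ctime()-style layout is reshuffled into RFC 822
# order ('Sun 6 Nov 1994 08:49:37 +0000' here) before parsing:
#
#     >>> _parse_date_asctime(u'Sun Nov  6 08:49:37 1994')[:6]
#     (1994, 11, 6, 8, 49, 37)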
3428
3429 def _parse_date_perforce(aDateString):
3430 """parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
3431 # Fri, 2006/09/15 08:19:53 EDT
3432 _my_date_pattern = re.compile( \
3433 r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
3434
3435 m = _my_date_pattern.search(aDateString)
3436 if m is None:
3437 return None
3438 dow, year, month, day, hour, minute, second, tz = m.groups()
3439 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
3440 dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
3441 tm = rfc822.parsedate_tz(dateString)
3442 if tm:
3443 return time.gmtime(rfc822.mktime_tz(tm))
3444 registerDateHandler(_parse_date_perforce)
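# Illustrative sketch using the sample date from the comment above; 'EDT'
# is resolved by rfc822.parsedate_tz(), so the result is shifted to GMT:
#
#     >>> _parse_date_perforce(u'Fri, 2006/09/15 08:19:53 EDT')[:6]
#     (2006, 9, 15, 12, 19, 53)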
3445
3446 def _parse_date(dateString):
3447 '''Parses a variety of date formats into a 9-tuple in GMT'''
3448 if not dateString:
3449 return None
3450 for handler in _date_handlers:
3451 try:
3452 date9tuple = handler(dateString)
3453 except (KeyError, OverflowError, ValueError):
3454 continue
3455 if not date9tuple:
3456 continue
3457 if len(date9tuple) != 9:
3458 continue
3459 return date9tuple
3460 return None
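# Illustrative sketch of the dispatcher: the first registered handler to
# produce a well-formed 9-tuple wins, and unparseable input yields None:
#
#     >>> _parse_date(u'Thu, 01 Jan 2004 19:48:21 GMT')[:6]
#     (2004, 1, 1, 19, 48, 21)
#     >>> _parse_date(u'not a date') is None
#     True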
3461
3462 # Each marker represents some of the characters of the opening XML
3463 # processing instruction ('<?xm') in the specified encoding.
3464 EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
3465 UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
3466 UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
3467 UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
3468 UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
3469
3470 ZERO_BYTES = _l2bytes([0x00, 0x00])
3471
3472 # Match the opening XML declaration.
3473 # Example: <?xml version="1.0" encoding="utf-8"?>
3474 RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
3475
3476 # Capture the value of the XML processing instruction's encoding attribute.
3477 # Example: <?xml version="1.0" encoding="utf-8"?>
3478 RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
3479
3480 def convert_to_utf8(http_headers, data):
3481 '''Detect and convert the character encoding to UTF-8.
3482
3483 http_headers is a dictionary
3484 data is a raw string (not Unicode)'''
3485
3486 # This is so much trickier than it sounds, it's not even funny.
3487 # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
3488 # is application/xml, application/*+xml,
3489 # application/xml-external-parsed-entity, or application/xml-dtd,
3490 # the encoding given in the charset parameter of the HTTP Content-Type
3491 # takes precedence over the encoding given in the XML prefix within the
3492 # document, and defaults to 'utf-8' if neither are specified. But, if
3493 # the HTTP Content-Type is text/xml, text/*+xml, or
3494 # text/xml-external-parsed-entity, the encoding given in the XML prefix
3495 # within the document is ALWAYS IGNORED and only the encoding given in
3496 # the charset parameter of the HTTP Content-Type header should be
3497 # respected, and it defaults to 'us-ascii' if not specified.
3498
3499 # Furthermore, discussion on the atom-syntax mailing list with the
3500 # author of RFC 3023 leads me to the conclusion that any document
3501 # served with a Content-Type of text/* and no charset parameter
3502 # must be treated as us-ascii. (We now do this.) And also that it
3503 # must always be flagged as non-well-formed. (We now do this too.)
3504
3505 # If Content-Type is unspecified (input was local file or non-HTTP source)
3506 # or unrecognized (server just got it totally wrong), then go by the
3507 # encoding given in the XML prefix of the document and default to
3508 # 'iso-8859-1' as per the HTTP specification (RFC 2616).
3509
3510 # Then, assuming we didn't find a character encoding in the HTTP headers
3511 # (and the HTTP Content-type allowed us to look in the body), we need
3512 # to sniff the first few bytes of the XML data and try to determine
3513 # whether the encoding is ASCII-compatible. Section F of the XML
3514 # specification shows the way here:
3515 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3516
3517 # If the sniffed encoding is not ASCII-compatible, we need to make it
3518 # ASCII compatible so that we can sniff further into the XML declaration
3519 # to find the encoding attribute, which will tell us the true encoding.
3520
3521 # Of course, none of this guarantees that we will be able to parse the
3522 # feed in the declared character encoding (assuming it was declared
3523 # correctly, which many are not). iconv_codec can help a lot;
3524 # you should definitely install it if you can.
3525 # http://cjkpython.i18n.org/
3526
3527 bom_encoding = u''
3528 xml_encoding = u''
3529 rfc3023_encoding = u''
3530
3531 # Look at the first few bytes of the document to guess what
3532 # its encoding may be. We only need to decode enough of the
3533 # document that we can use an ASCII-compatible regular
3534 # expression to search for an XML encoding declaration.
3535 # The heuristic follows the XML specification, section F:
3536 # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
3537 # Check for BOMs first.
3538 if data[:4] == codecs.BOM_UTF32_BE:
3539 bom_encoding = u'utf-32be'
3540 data = data[4:]
3541 elif data[:4] == codecs.BOM_UTF32_LE:
3542 bom_encoding = u'utf-32le'
3543 data = data[4:]
3544 elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
3545 bom_encoding = u'utf-16be'
3546 data = data[2:]
3547 elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
3548 bom_encoding = u'utf-16le'
3549 data = data[2:]
3550 elif data[:3] == codecs.BOM_UTF8:
3551 bom_encoding = u'utf-8'
3552 data = data[3:]
3553 # Check for the characters '<?xm' in several encodings.
3554 elif data[:4] == EBCDIC_MARKER:
3555 bom_encoding = u'cp037'
3556 elif data[:4] == UTF16BE_MARKER:
3557 bom_encoding = u'utf-16be'
3558 elif data[:4] == UTF16LE_MARKER:
3559 bom_encoding = u'utf-16le'
3560 elif data[:4] == UTF32BE_MARKER:
3561 bom_encoding = u'utf-32be'
3562 elif data[:4] == UTF32LE_MARKER:
3563 bom_encoding = u'utf-32le'
3564
3565 tempdata = data
3566 try:
3567 if bom_encoding:
3568 tempdata = data.decode(bom_encoding).encode('utf-8')
3569 except (UnicodeDecodeError, LookupError):
3570 # feedparser recognizes UTF-32 encodings that aren't
3571 # available in Python 2.4 and 2.5, so it's possible to
3572 # encounter a LookupError during decoding.
3573 xml_encoding_match = None
3574 else:
3575 xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
3576
3577 if xml_encoding_match:
3578 xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
3579 # Normalize the xml_encoding if necessary.
3580 if bom_encoding and (xml_encoding in (
3581 u'u16', u'utf-16', u'utf16', u'utf_16',
3582 u'u32', u'utf-32', u'utf32', u'utf_32',
3583 u'iso-10646-ucs-2', u'iso-10646-ucs-4',
3584 u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
3585 )):
3586 xml_encoding = bom_encoding
3587
3588 # Find the HTTP Content-Type and, hopefully, a character
3589 # encoding provided by the server. The Content-Type is used
3590 # to choose the "correct" encoding among the BOM encoding,
3591 # XML declaration encoding, and HTTP encoding, following the
3592 # heuristic defined in RFC 3023.
3593 http_content_type = http_headers.get('content-type') or ''
3594 http_content_type, params = cgi.parse_header(http_content_type)
3595 http_encoding = params.get('charset', '').replace("'", "")
3596 if not isinstance(http_encoding, unicode):
3597 http_encoding = http_encoding.decode('utf-8', 'ignore')
3598
3599 acceptable_content_type = 0
3600 application_content_types = (u'application/xml', u'application/xml-dtd',
3601 u'application/xml-external-parsed-entity')
3602 text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
3603 if (http_content_type in application_content_types) or \
3604 (http_content_type.startswith(u'application/') and
3605 http_content_type.endswith(u'+xml')):
3606 acceptable_content_type = 1
3607 rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
3608 elif (http_content_type in text_content_types) or \
3609 (http_content_type.startswith(u'text/') and
3610 http_content_type.endswith(u'+xml')):
3611 acceptable_content_type = 1
3612 rfc3023_encoding = http_encoding or u'us-ascii'
3613 elif http_content_type.startswith(u'text/'):
3614 rfc3023_encoding = http_encoding or u'us-ascii'
3615 elif http_headers and 'content-type' not in http_headers:
3616 rfc3023_encoding = xml_encoding or u'iso-8859-1'
3617 else:
3618 rfc3023_encoding = xml_encoding or u'utf-8'
3619 # gb18030 is a superset of gb2312, so always replace gb2312
3620 # with gb18030 for greater compatibility.
3621 if rfc3023_encoding.lower() == u'gb2312':
3622 rfc3023_encoding = u'gb18030'
3623 if xml_encoding.lower() == u'gb2312':
3624 xml_encoding = u'gb18030'
3625
3626 # there are four encodings to keep track of:
3627 # - http_encoding is the encoding declared in the Content-Type HTTP header
3628 # - xml_encoding is the encoding declared in the <?xml declaration
3629 # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
3630 # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
3631 error = None
3632
3633 if http_headers and (not acceptable_content_type):
3634 if 'content-type' in http_headers:
3635 msg = '%s is not an XML media type' % http_headers['content-type']
3636 else:
3637 msg = 'no Content-type specified'
3638 error = NonXMLContentType(msg)
3639
3640 # determine character encoding
3641 known_encoding = 0
3642 lazy_chardet_encoding = None
3643 tried_encodings = []
3644 if chardet:
3645 def lazy_chardet_encoding():
3646 chardet_encoding = chardet.detect(data)['encoding']
3647 if not chardet_encoding:
3648 chardet_encoding = ''
3649 if not isinstance(chardet_encoding, unicode):
3650 chardet_encoding = unicode(chardet_encoding, 'ascii', 'ignore')
3651 return chardet_encoding
3652 # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
3653 for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
3654 lazy_chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
3655 if callable(proposed_encoding):
3656 proposed_encoding = proposed_encoding()
3657 if not proposed_encoding:
3658 continue
3659 if proposed_encoding in tried_encodings:
3660 continue
3661 tried_encodings.append(proposed_encoding)
3662 try:
3663 data = data.decode(proposed_encoding)
3664 except (UnicodeDecodeError, LookupError):
3665 pass
3666 else:
3667 known_encoding = 1
3668 # Update the encoding in the opening XML processing instruction.
3669 new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
3670 if RE_XML_DECLARATION.search(data):
3671 data = RE_XML_DECLARATION.sub(new_declaration, data)
3672 else:
3673 data = new_declaration + u'\n' + data
3674 data = data.encode('utf-8')
3675 break
3676 # if still no luck, give up
3677 if not known_encoding:
3678 error = CharacterEncodingUnknown(
3679 'document encoding unknown, I tried ' +
3680 '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
3681 (rfc3023_encoding, xml_encoding))
3682 rfc3023_encoding = u''
3683 elif proposed_encoding != rfc3023_encoding:
3684 error = CharacterEncodingOverride(
3685 'document declared as %s, but parsed as %s' %
3686 (rfc3023_encoding, proposed_encoding))
3687 rfc3023_encoding = proposed_encoding
3688
3689 return data, rfc3023_encoding, error
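# Illustrative sketch of the RFC 3023 precedence rules above (a minimal
# case, not an exhaustive one): with an XML media type and no charset
# parameter, the encoding declared in the XML prefix wins, and the
# document comes back re-encoded as UTF-8 with its declaration rewritten:
#
#     >>> doc = '<?xml version="1.0" encoding="iso-8859-1"?><feed/>'
#     >>> headers = {'content-type': 'application/xml'}
#     >>> doc, enc, err = convert_to_utf8(headers, doc)
#     >>> enc, err
#     (u'iso-8859-1', None)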
3690
3691 # Match XML entity declarations.
3692 # Example: <!ENTITY copyright "(C)">
3693 RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
3694
3695 # Match XML DOCTYPE declarations.
3696 # Example: <!DOCTYPE feed [ ]>
3697 RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
3698
3699 # Match safe entity declarations.
3700 # This will allow numeric (decimal or hexadecimal) character references
3701 # as well as text, but not arbitrary nested entities.
3702 # Example: cubed "&#179;"
3703 # Example: copyright "(C)"
3704 # Forbidden: explode1 "&explode2;&explode2;"
3705 RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
3706
3707 def replace_doctype(data):
3708 '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
3709
3710 rss_version may be 'rss091n' or None
3711 stripped_data is the same XML document with a replaced DOCTYPE
3712 '''
3713
3714 # Divide the document into two groups by finding the location
3715 # of the first element that doesn't begin with '<?' or '<!'.
3716 start = re.search(_s2bytes('<\w'), data)
3717 start = start and start.start() or -1
3718 head, data = data[:start+1], data[start+1:]
3719
3720 # Save and then remove all of the ENTITY declarations.
3721 entity_results = RE_ENTITY_PATTERN.findall(head)
3722 head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
3723
3724 # Find the DOCTYPE declaration and check the feed type.
3725 doctype_results = RE_DOCTYPE_PATTERN.findall(head)
3726 doctype = doctype_results and doctype_results[0] or _s2bytes('')
3727 if _s2bytes('netscape') in doctype.lower():
3728 version = u'rss091n'
3729 else:
3730 version = None
3731
3732 # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
3733 replacement = _s2bytes('')
3734 if len(doctype_results) == 1 and entity_results:
3735 match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
3736 safe_entities = filter(match_safe_entities, entity_results)
3737 if safe_entities:
3738 replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
3739 + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
3740 + _s2bytes('>\n]>')
3741 data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
3742
3743 # Precompute the safe entities for the loose parser.
3744 safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
3745 for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
3746 return version, data, safe_entities
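# Illustrative sketch: a Netscape RSS 0.91 DOCTYPE is detected and
# stripped, and with no ENTITY declarations no safe entities survive:
#
#     >>> doc = ('<!DOCTYPE rss SYSTEM '
#     ...        '"http://my.netscape.com/publish/formats/rss-0.91.dtd">'
#     ...        '<rss version="0.91"/>')
#     >>> version, stripped, entities = replace_doctype(doc)
#     >>> version, entities
#     (u'rss091n', {})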
3747
3748
3749 # GeoRSS geometry parsers. Each returns a dict with 'type' and 'coordinates'
3750 # items, or None in the case of a parsing error.
3751
3752 def _parse_poslist(value, geom_type, swap=True, dims=2):
3753 if geom_type == 'linestring':
3754 return _parse_georss_line(value, swap, dims)
3755 elif geom_type == 'polygon':
3756 ring = _parse_georss_line(value, swap, dims)
3757 return ring and {u'type': u'Polygon', u'coordinates': (ring['coordinates'],)} or None
3758 else:
3759 return None
3760
3761 def _gen_georss_coords(value, swap=True, dims=2):
3762 # A generator of (lon, lat) pairs from a string of encoded GeoRSS
3763 # coordinates. Converts to floats and swaps order.
3764 latlons = itertools.imap(float, value.strip().replace(',', ' ').split())
3765 nxt = latlons.next
3766 while True:
3767 t = [nxt(), nxt()][::swap and -1 or 1]
3768 if dims == 3:
3769 t.append(nxt())
3770 yield tuple(t)
3771
3772 def _parse_georss_point(value, swap=True, dims=2):
3773 # A point contains a single latitude-longitude pair, separated by
3774 # whitespace. We'll also handle comma separators.
3775 try:
3776 coords = list(_gen_georss_coords(value, swap, dims))
3777 return {u'type': u'Point', u'coordinates': coords[0]}
3778 except (IndexError, ValueError):
3779 return None
3780
3781 def _parse_georss_line(value, swap=True, dims=2):
3782 # A line contains a space separated list of latitude-longitude pairs in
3783 # WGS84 coordinate reference system, with each pair separated by
3784 # whitespace. There must be at least two pairs.
3785 try:
3786 coords = list(_gen_georss_coords(value, swap, dims))
3787 return {u'type': u'LineString', u'coordinates': coords}
3788 except (IndexError, ValueError):
3789 return None
3790
3791 def _parse_georss_polygon(value, swap=True, dims=2):
3792 # A polygon contains a space separated list of latitude-longitude pairs,
3793 # with each pair separated by whitespace. There must be at least four
3794 # pairs, with the last being identical to the first (so a polygon has a
3795 # minimum of three actual points).
3796 try:
3797 ring = list(_gen_georss_coords(value, swap, dims))
3798 except (IndexError, ValueError):
3799 return None
3800 if len(ring) < 4:
3801 return None
3802 return {u'type': u'Polygon', u'coordinates': (ring,)}
3803
3804 def _parse_georss_box(value, swap=True, dims=2):
3805 # A bounding box is a rectangular region, often used to define the extents
3806 # of a map or a rough area of interest. A box contains two space-separated
3807 # latitude-longitude pairs, with each pair separated by whitespace. The
3808 # first pair is the lower corner, the second is the upper corner.
3809 try:
3810 coords = list(_gen_georss_coords(value, swap, dims))
3811 return {u'type': u'Box', u'coordinates': tuple(coords)}
3812 except (IndexError, ValueError):
3813 return None
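# Illustrative sketch of the shared coordinate handling: GeoRSS order is
# 'lat lon', so with the default swap=True the pairs come back in
# GeoJSON-style (lon, lat) order:
#
#     >>> p = _parse_georss_point(u'45.256 -71.92')
#     >>> p['type'], p['coordinates']
#     (u'Point', (-71.92, 45.256))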
3814
3815 # end geospatial parsers
3816
3817
3818 def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
3819 '''Parse a feed from a URL, file, stream, or string.
3820
3821 request_headers, if given, is a dict from http header name to value to add
3822 to the request; this overrides internally generated values.
3823
3824 :return: A :class:`FeedParserDict`.
3825 '''
3826
3827 if handlers is None:
3828 handlers = []
3829 if request_headers is None:
3830 request_headers = {}
3831 if response_headers is None:
3832 response_headers = {}
3833
3834 result = FeedParserDict()
3835 result['feed'] = FeedParserDict()
3836 result['entries'] = []
3837 result['bozo'] = 0
3838 if not isinstance(handlers, list):
3839 handlers = [handlers]
3840 try:
3841 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
3842 data = f.read()
3843 except Exception, e:
3844 result['bozo'] = 1
3845 result['bozo_exception'] = e
3846 data = None
3847 f = None
3848
3849 if hasattr(f, 'headers'):
3850 result['headers'] = dict(f.headers)
3851 # overwrite existing headers using response_headers
3852 if 'headers' in result:
3853 result['headers'].update(response_headers)
3854 elif response_headers:
3855 result['headers'] = copy.deepcopy(response_headers)
3856
3857 # lowercase all of the HTTP headers for comparisons per RFC 2616
3858 if 'headers' in result:
3859 http_headers = dict((k.lower(), v) for k, v in result['headers'].items())
3860 else:
3861 http_headers = {}
3862
3863 # if feed is gzip-compressed, decompress it
3864 if f and data and http_headers:
3865 if gzip and 'gzip' in http_headers.get('content-encoding', ''):
3866 try:
3867 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
3868 except (IOError, struct.error), e:
3869 # IOError can occur if the gzip header is bad.
3870 # struct.error can occur if the data is damaged.
3871 result['bozo'] = 1
3872 result['bozo_exception'] = e
3873 if isinstance(e, struct.error):
3874 # A gzip header was found but the data is corrupt.
3875 # Ideally, we should re-request the feed without the
3876 # 'Accept-encoding: gzip' header, but we don't.
3877 data = None
3878 elif zlib and 'deflate' in http_headers.get('content-encoding', ''):
3879 try:
3880 data = zlib.decompress(data)
3881 except zlib.error, e:
3882 try:
3883 # The data may have no headers and no checksum.
3884 data = zlib.decompress(data, -15)
3885 except zlib.error, e:
3886 result['bozo'] = 1
3887 result['bozo_exception'] = e
3888
3889 # save HTTP headers
3890 if http_headers:
3891 if 'etag' in http_headers:
3892 etag = http_headers.get('etag', u'')
3893 if not isinstance(etag, unicode):
3894 etag = etag.decode('utf-8', 'ignore')
3895 if etag:
3896 result['etag'] = etag
3897 if 'last-modified' in http_headers:
3898 modified = http_headers.get('last-modified', u'')
3899 if modified:
3900 result['modified'] = modified
3901 result['modified_parsed'] = _parse_date(modified)
3902 if hasattr(f, 'url'):
3903 if not isinstance(f.url, unicode):
3904 result['href'] = f.url.decode('utf-8', 'ignore')
3905 else:
3906 result['href'] = f.url
3907 result['status'] = 200
3908 if hasattr(f, 'status'):
3909 result['status'] = f.status
3910 if hasattr(f, 'close'):
3911 f.close()
3912
3913 if data is None:
3914 return result
3915
3916 # Stop processing if the server sent HTTP 304 Not Modified.
3917 if getattr(f, 'code', 0) == 304:
3918 result['version'] = u''
3919 result['debug_message'] = 'The feed has not changed since you last checked, ' + \
3920 'so the server sent no data. This is a feature, not a bug!'
3921 return result
3922
3923 data, result['encoding'], error = convert_to_utf8(http_headers, data)
3924 use_strict_parser = result['encoding'] and True or False
3925 if error is not None:
3926 result['bozo'] = 1
3927 result['bozo_exception'] = error
3928
3929 result['version'], data, entities = replace_doctype(data)
3930
3931 # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
3932 contentloc = http_headers.get('content-location', u'')
3933 href = result.get('href', u'')
3934 baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
3935
3936 baselang = http_headers.get('content-language', None)
3937 if not isinstance(baselang, unicode) and baselang is not None:
3938 baselang = baselang.decode('utf-8', 'ignore')
3939
3940 if not _XML_AVAILABLE:
3941 use_strict_parser = 0
3942 if use_strict_parser:
3943 # initialize the SAX parser
3944 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
3945 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
3946 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
3947 try:
3948 # disable downloading external doctype references, if possible
3949 saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
3950 except xml.sax.SAXNotSupportedException:
3951 pass
3952 saxparser.setContentHandler(feedparser)
3953 saxparser.setErrorHandler(feedparser)
3954 source = xml.sax.xmlreader.InputSource()
3955 source.setByteStream(_StringIO(data))
3956 try:
3957 saxparser.parse(source)
3958 except xml.sax.SAXException, e:
3959 result['bozo'] = 1
3960 result['bozo_exception'] = feedparser.exc or e
3961 use_strict_parser = 0
3962 if not use_strict_parser and _SGML_AVAILABLE:
3963 feedparser = _LooseFeedParser(baseuri, baselang, 'utf-8', entities)
3964 feedparser.feed(data.decode('utf-8', 'replace'))
3965 result['feed'] = feedparser.feeddata
3966 result['entries'] = feedparser.entries
3967 result['version'] = result['version'] or feedparser.version
3968 result['namespaces'] = feedparser.namespacesInUse
3969 return result
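# Illustrative usage sketch (the URL is a placeholder, not a real feed):
# parse() is designed not to raise on fetch or parse failures; instead it
# sets result['bozo'] and stores the exception in result['bozo_exception']:
#
#     >>> result = parse('http://example.org/feed.xml')
#     >>> if not result['bozo']:
#     ...     for entry in result['entries']:
#     ...         print entry.get('title')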
3970
3971 # The list of EPSG codes for geographic (latitude/longitude) coordinate
3972 # systems to support decoding of GeoRSS GML profiles.
3973 _geogCS = [
3974 3819, 3821, 3824, 3889, 3906, 4001, 4002, 4003, 4004, 4005, 4006, 4007, 4008,
3975 4009, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4018, 4019, 4020, 4021, 4022,
3976 4023, 4024, 4025, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036,
3977 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4052, 4053, 4054, 4055, 4075, 4081,
3978 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132,
3979 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145,
3980 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158,
3981 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171,
3982 4172, 4173, 4174, 4175, 4176, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185,
3983 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200,
3984 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213,
3985 4214, 4215, 4216, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227,
3986 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240,
3987 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4252, 4253,
3988 4254, 4255, 4256, 4257, 4258, 4259, 4260, 4261, 4262, 4263, 4264, 4265, 4266,
3989 4267, 4268, 4269, 4270, 4271, 4272, 4273, 4274, 4275, 4276, 4277, 4278, 4279,
3990 4280, 4281, 4282, 4283, 4284, 4285, 4286, 4287, 4288, 4289, 4291, 4292, 4293,
3991 4294, 4295, 4296, 4297, 4298, 4299, 4300, 4301, 4302, 4303, 4304, 4306, 4307,
3992 4308, 4309, 4310, 4311, 4312, 4313, 4314, 4315, 4316, 4317, 4318, 4319, 4322,
3993 4324, 4326, 4463, 4470, 4475, 4483, 4490, 4555, 4558, 4600, 4601, 4602, 4603,
3994 4604, 4605, 4606, 4607, 4608, 4609, 4610, 4611, 4612, 4613, 4614, 4615, 4616,
3995 4617, 4618, 4619, 4620, 4621, 4622, 4623, 4624, 4625, 4626, 4627, 4628, 4629,
3996 4630, 4631, 4632, 4633, 4634, 4635, 4636, 4637, 4638, 4639, 4640, 4641, 4642,
3997 4643, 4644, 4645, 4646, 4657, 4658, 4659, 4660, 4661, 4662, 4663, 4664, 4665,
3998 4666, 4667, 4668, 4669, 4670, 4671, 4672, 4673, 4674, 4675, 4676, 4677, 4678,
3999 4679, 4680, 4681, 4682, 4683, 4684, 4685, 4686, 4687, 4688, 4689, 4690, 4691,
4000 4692, 4693, 4694, 4695, 4696, 4697, 4698, 4699, 4700, 4701, 4702, 4703, 4704,
4001 4705, 4706, 4707, 4708, 4709, 4710, 4711, 4712, 4713, 4714, 4715, 4716, 4717,
4002 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4726, 4727, 4728, 4729, 4730,
4003 4731, 4732, 4733, 4734, 4735, 4736, 4737, 4738, 4739, 4740, 4741, 4742, 4743,
4004 4744, 4745, 4746, 4747, 4748, 4749, 4750, 4751, 4752, 4753, 4754, 4755, 4756,
4005 4757, 4758, 4759, 4760, 4761, 4762, 4763, 4764, 4765, 4801, 4802, 4803, 4804,
4006 4805, 4806, 4807, 4808, 4809, 4810, 4811, 4813, 4814, 4815, 4816, 4817, 4818,
4007 4819, 4820, 4821, 4823, 4824, 4901, 4902, 4903, 4904, 4979 ]
4008