iSimplify parsing and unescape text entries. - zs - Zeitungsschau rss to email converter Err gopher.r-36.net 70 i Err gopher.r-36.net 70 1Log /scm/zs//log.gph gopher.r-36.net 70 1Files /scm/zs//files.gph gopher.r-36.net 70 1Refs /scm/zs//refs.gph gopher.r-36.net 70 1LICENSE /scm/zs//file/LICENSE.gph gopher.r-36.net 70 i--- Err gopher.r-36.net 70 1commit 9891ca73640aa4fa074c54e92913f847ba1e756b /scm/zs//commit/9891ca73640aa4fa074c54e92913f847ba1e756b.gph gopher.r-36.net 70 1parent 9e95a0f332a1bfabfba59c9bad6460e70731db9f /scm/zs//commit/9e95a0f332a1bfabfba59c9bad6460e70731db9f.gph gopher.r-36.net 70 hAuthor: Christoph Lohmann <20h@r-36.net> URL:mailto:20h@r-36.net gopher.r-36.net 70 iDate: Wed, 11 Nov 2015 22:08:35 +0100 Err gopher.r-36.net 70 i Err gopher.r-36.net 70 iSimplify parsing and unescape text entries. Err gopher.r-36.net 70 i Err gopher.r-36.net 70 iDiffstat: Err gopher.r-36.net 70 i zeitungsschau/feed.py | 62 +++++++++++++++---------------- Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i1 file changed, 29 insertions(+), 33 deletions(-) Err gopher.r-36.net 70 i--- Err gopher.r-36.net 70 1diff --git a/zeitungsschau/feed.py b/zeitungsschau/feed.py /scm/zs//file/zeitungsschau/feed.py.gph gopher.r-36.net 70 i@@ -13,6 +13,7 @@ import requests Err gopher.r-36.net 70 i import hashlib Err gopher.r-36.net 70 i import pytz Err gopher.r-36.net 70 i import codecs Err gopher.r-36.net 70 i+import html Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i def parseiso(dstr, now): Err gopher.r-36.net 70 i try: Err gopher.r-36.net 70 i@@ -32,18 +33,9 @@ def removenamespaces(xml): Err gopher.r-36.net 70 i elem.tag = elem.tag[nsl:] Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i def parsexml(astr): Err gopher.r-36.net 70 i- try: Err gopher.r-36.net 70 i- xml = objectify.fromstring(astr) Err gopher.r-36.net 70 i- removenamespaces(xml) Err gopher.r-36.net 70 i- except etree.XMLSyntaxError: Err gopher.r-36.net 70 i- try: Err gopher.r-36.net 70 i- parser = etree.HTMLParser() Err gopher.r-36.net 70 i- xml = objectify.fromstring(astr, parser) Err gopher.r-36.net 70 i- removenamespaces(xml) Err gopher.r-36.net 70 i- except etree.XMLSyntaxError: Err gopher.r-36.net 70 i- parser = etree.XMLParser(resolve_entities=False) Err gopher.r-36.net 70 i- xml = objectify.fromstring(astr, parser) Err gopher.r-36.net 70 i- removenamespaces(xml) Err gopher.r-36.net 70 i+ xml = objectify.fromstring(astr) Err gopher.r-36.net 70 i+ removenamespaces(xml) Err gopher.r-36.net 70 i+ # Throw XML parsing errors so we can blame the feed authors. Err gopher.r-36.net 70 i return xml Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i def parse(astr): Err gopher.r-36.net 70 i@@ -57,10 +49,6 @@ def parse(astr): Err gopher.r-36.net 70 i isrdf = False Err gopher.r-36.net 70 i now = datetime.now(pytz.utc) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i- feede = xml.xpath(".//feed") Err gopher.r-36.net 70 i- if len(feede) > 0: Err gopher.r-36.net 70 i- xml = feede[0] Err gopher.r-36.net 70 i- Err gopher.r-36.net 70 i if hasattr(xml, "channel"): Err gopher.r-36.net 70 i if hasattr(xml, "item"): Err gopher.r-36.net 70 i isrdf = True Err gopher.r-36.net 70 i@@ -71,11 +59,11 @@ def parse(astr): Err gopher.r-36.net 70 i feed["title"] = "" Err gopher.r-36.net 70 i for e in ("title", "description"): Err gopher.r-36.net 70 i if hasattr(xml, e): Err gopher.r-36.net 70 i- feed[e] = str(xml[e]) Err gopher.r-36.net 70 i+ feed[e] = html.unescape(str(xml[e])) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i if hasattr(xml, "image") and hasattr(xml.image, "title"): Err gopher.r-36.net 70 i if "title" not in feed: Err gopher.r-36.net 70 i- feed["title"] = str(xml.image.title) Err gopher.r-36.net 70 i+ feed["title"] = html.unescape(str(xml.image.title)) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i if hasattr(xml, "updated"): Err gopher.r-36.net 70 i feed["updated"] = parseiso(xml.updated, now) Err gopher.r-36.net 70 i@@ -93,25 +81,25 @@ def parse(astr): Err gopher.r-36.net 70 i feed["link"] = str(xml.link) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i if hasattr(xml, "webmaster"): Err gopher.r-36.net 70 i- feed["email"] = str(xml.webmaster) Err gopher.r-36.net 70 i+ feed["email"] = html.unescape(str(xml.webmaster)) Err gopher.r-36.net 70 i elif hasattr(xml, "owner") and hasattr(xml.owner, "email"): Err gopher.r-36.net 70 i- feed["email"] = str(xml.owner.email) Err gopher.r-36.net 70 i+ feed["email"] = html.unescape(str(xml.owner.email)) Err gopher.r-36.net 70 i elif hasattr(xml, "author") and hasattr(xml.author, "email"): Err gopher.r-36.net 70 i- feed["email"] = str(xml.author.email) Err gopher.r-36.net 70 i+ feed["email"] = html.unescape(str(xml.author.email)) Err gopher.r-36.net 70 i elif hasattr(xml, "webMaster"): Err gopher.r-36.net 70 i- feed["email"] = str(xml.webMaster) Err gopher.r-36.net 70 i+ feed["email"] = html.unescape(str(xml.webMaster)) Err gopher.r-36.net 70 i elif hasattr(xml, "managingeditor"): Err gopher.r-36.net 70 i- feed["email"] = str(xml.managingeditor) Err gopher.r-36.net 70 i+ feed["email"] = html.unescape(str(xml.managingeditor)) Err gopher.r-36.net 70 i elif hasattr(xml, "managingEditor"): Err gopher.r-36.net 70 i- feed["email"] = str(xml.managingEditor) Err gopher.r-36.net 70 i+ feed["email"] = html.unescape(str(xml.managingEditor)) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i if hasattr(xml, "author"): Err gopher.r-36.net 70 i if hasattr(xml.author, "name"): Err gopher.r-36.net 70 i- feed["author"] = str(xml.author.name) Err gopher.r-36.net 70 i+ feed["author"] = html.unescape(str(xml.author.name)) Err gopher.r-36.net 70 i else: Err gopher.r-36.net 70 i- feed["author"] = str(xml.author) Err gopher.r-36.net 70 i+ feed["author"] = html.unescape(str(xml.author)) Err gopher.r-36.net 70 i elif hasattr(xml, "creator"): Err gopher.r-36.net 70 i- feed["author"] = str(xml.creator) Err gopher.r-36.net 70 i+ feed["author"] = html.unescape(str(xml.creator)) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i entryname = "entry" Err gopher.r-36.net 70 i if isrss == True or isrdf == True: Err gopher.r-36.net 70 i@@ -123,7 +111,8 @@ def parse(astr): Err gopher.r-36.net 70 i article = {} Err gopher.r-36.net 70 i # title Err gopher.r-36.net 70 i if hasattr(entry, "title"): Err gopher.r-36.net 70 i- article["title"] = str(entry["title"]) Err gopher.r-36.net 70 i+ article["title"] = html.unescape(\ Err gopher.r-36.net 70 i+ str(entry["title"])) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i # link Err gopher.r-36.net 70 i if hasattr(entry, "link"): Err gopher.r-36.net 70 i@@ -149,8 +138,9 @@ def parse(astr): Err gopher.r-36.net 70 i hasattr(entry.group, "content"): Err gopher.r-36.net 70 i if "url" in entry.group.content: Err gopher.r-36.net 70 i article["file"] = \ Err gopher.r-36.net 70 i+ html.unescape(\ Err gopher.r-36.net 70 i str(entry.group.content.\ Err gopher.r-36.net 70 i- attrib["file"]) Err gopher.r-36.net 70 i+ attrib["file"])) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i # updated Err gopher.r-36.net 70 i try: Err gopher.r-36.net 70 i@@ -171,19 +161,25 @@ def parse(astr): Err gopher.r-36.net 70 i # author Err gopher.r-36.net 70 i if hasattr(entry, "author"): Err gopher.r-36.net 70 i if hasattr(entry.author, "name"): Err gopher.r-36.net 70 i- article["author"] = str(entry.author.name) Err gopher.r-36.net 70 i+ article["author"] = html.unescape(\ Err gopher.r-36.net 70 i+ str(entry.author.name)) Err gopher.r-36.net 70 i else: Err gopher.r-36.net 70 i- article["author"] = str(entry.author) Err gopher.r-36.net 70 i+ article["author"] = html.unescape(\ Err gopher.r-36.net 70 i+ str(entry.author)) Err gopher.r-36.net 70 i elif hasattr(entry, "creator"): Err gopher.r-36.net 70 i- article["author"] = str(entry.creator) Err gopher.r-36.net 70 i+ article["author"] = html.unescape(\ Err gopher.r-36.net 70 i+ str(entry.creator)) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i # tags Err gopher.r-36.net 70 i if hasattr(entry, "category"): Err gopher.r-36.net 70 i article["tags"] = [] Err gopher.r-36.net 70 i for cat in entry["category"][:]: Err gopher.r-36.net 70 i- article["tags"].append(str(cat)) Err gopher.r-36.net 70 i+ article["tags"].append(\ Err gopher.r-36.net 70 i+ html.unescape(\ Err gopher.r-36.net 70 i+ str(cat))) Err gopher.r-36.net 70 i Err gopher.r-36.net 70 i # text Err gopher.r-36.net 70 i+ # Don't unescape the text, it might contain HTML. Err gopher.r-36.net 70 i if hasattr(entry, "encoded"): Err gopher.r-36.net 70 i article["text"] = str(entry.encoded) Err gopher.r-36.net 70 i elif hasattr(entry, "content"): Err gopher.r-36.net 70 .