feed.py - zs - Zeitungsschau RSS to email converter
 (HTM) git clone git://r-36.net/zs
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       feed.py (10854B)
       ---
            1 #
            2 # See LICENSE for licensing details.
            3 #
            4 # Copy me if you can.
            5 # by 20h
            6 #
            7 
import codecs
import hashlib
import html
import json
import socket
import urllib.parse
from datetime import datetime
from datetime import timezone

import dateutil.parser
from dateutil.tz import gettz
import lxml
import lxml.objectify
import pytz
import requests
           22 
           23 def parseiso(dstr, now):
           24         def gettzinfo(zone, offset):
           25                 try:
           26                         return gettz(zone)
           27                 except:
           28                         return None
           29 
           30         try:
           31                 return dateutil.parser.parse(str(dstr), default=now,
           32                                 tzinfos=gettzinfo)
           33         except:
           34                 # Invalid time format. Could not be parsed.
           35                 return now
           36 
           37 def removenamespaces(xml):
           38         for key in xml.nsmap:
           39                 nsstr = u'{%s}' % (xml.nsmap[key])
           40                 nsl = len(nsstr)
           41 
           42                 for elem in xml.getiterator():
           43                         if elem.tag.startswith(nsstr):
           44                                 elem.tag = elem.tag[nsl:]
           45 
           46 def parsexml(astr):
           47         xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8"))
           48         removenamespaces(xml)
           49         # Throw XML parsing errors so we can blame the feed authors.
           50         #print(lxml.objectify.dump(xml))
           51         return xml
           52 
           53 def parsetwtxtfeed(astr, uri):
           54         feed = {}
           55         articles = []
           56         now = datetime.now(pytz.utc)
           57         now = now.replace(hour=20, minute=20, second=20, microsecond=20)
           58 
           59         feed["title"] = uri
           60         feed["link"] = uri
           61         feed["updated"] = now
           62 
           63         lines = astr.split("\n");
           64         for line in lines:
           65                 # People already reinterpret the standard. :(
           66                 if len(line) == 0:
           67                         continue
           68                 if line[0] == "#":
           69                         continue
           70 
           71                 createdtxt, ltext = line.split("\t", 1)
           72                 created = parseiso(createdtxt, now)
           73 
           74                 article = {}
           75                 article["id"] = createdtxt
           76                 article["title"] = ltext
           77                 article["text"] = ltext
           78                 article["uuid"] = createdtxt
           79                 article["updated"] = created
           80 
           81                 if article["updated"] == now:
           82                         article["uuid"] = ""
           83                 else:
           84                         article["uuid"] = "%s" % (article["updated"])
           85 
           86                 articles.append(article)
           87 
           88         feed["articles"] = articles
           89 
           90         return feed
           91 
           92 def parsejsonfeed(astr):
           93         js = json.loads(astr)
           94 
           95         feed = {}
           96         articles = []
           97         now = datetime.now(pytz.utc)
           98         now = now.replace(hour=20, minute=20, second=20, microsecond=20)
           99 
          100         if "title" in js:
          101                 feed["title"] = js["title"]
          102         if "description" in js:
          103                 feed["description"] = js["description"]
          104         if "home_page_url" in js:
          105                 feed["link"] = js["home_page_url"]
          106         if "feed_url" in js:
          107                 feed["link"] = js["feed_url"]
          108         if "author" in js:
          109                 if "name" in js["author"]:
          110                         feed["author"] = js["author"]["name"]
          111         feed["updated"] = now
          112 
          113         if "items" in js:
          114                 for item in js["items"]:
          115                         article = {}
          116                         if "url" in item:
          117                                 article["file"] = item["url"]
          118                         if "title" in item:
          119                                 article["title"] = item["title"]
          120                         if "id" in item:
          121                                 article["id"] = item["id"]
          122                         else:
          123                                 if "link" in article:
          124                                         article["id"] = article["link"]
          125                                 elif "file" in article:
          126                                         article["id"] = article["file"]
          127                                 else:
          128                                         article["id"] = article["text"][:30]
          129 
          130                         if "summary" in item:
          131                                 article["text"] = html.unescape(item["summary"])
          132                         if "content_html" in item:
          133                                 article["text"] = html.unescape(item["content_html"])
          134                         if "content_text" in item:
          135                                 article["text"] = html.unescape(item["content_text"])
          136                         if "date_published" in item:
          137                                 article["updated"] = \
          138                                         dateutil.parser.parse(item["date_published"])
          139                         else:
          140                                 article["updated"] = now
          141 
          142                         if article["updated"] == now:
          143                                 article["uuid"] = ""
          144                         else:
          145                                 article["uuid"] = "%s" % (article["updated"])
          146 
          147                         for e in ("id", "title", "file"):
          148                                 if e in article:
          149                                         article["uuid"] = "%s-%s" % \
          150                                                 (article["uuid"],\
          151                                                  article[e])
          152 
          153                         def mkuuid(s):
          154                                 return hashlib.sha256(str(s).\
          155                                         encode("utf8")).hexdigest()
          156                         if len(article["uuid"]) == 0:
          157                                 article["uuid"] = mkuuid(now)
          158                         else:
          159                                 article["uuid"] = mkuuid(article["uuid"])
          160 
          161                         # sanity checks
          162                         if "title" not in article and "text" not in article \
          163                                         and "file" not in article:
          164                                 continue
          165 
          166                         articles.append(article)
          167 
          168         feed["articles"] = articles
          169 
          170         return feed
          171 
          172 def parseatomfeed(astr):
          173         xml = parsexml(astr)
          174         if xml == None:
          175                 return None
          176 
          177         feed = {}
          178         articles = []
          179         isrss = False
          180         isrdf = False
          181         now = datetime.now(pytz.utc)
          182         now = now.replace(hour=20, minute=20, second=20, microsecond=20)
          183 
          184         if hasattr(xml, "channel"):
          185                 if hasattr(xml, "item"):
          186                         isrdf = True
          187                         oxml = xml
          188                 xml = xml.channel
          189                 isrss = True
          190 
          191         feed["title"] = ""
          192         for e in ("title", "description"):
          193                 if hasattr(xml, e):
          194                         feed[e] = html.unescape(str(xml[e]))
          195         
          196         if hasattr(xml, "image") and hasattr(xml.image, "title"):
          197                 if "title" not in feed:
          198                         feed["title"] = html.unescape(str(xml.image.title))
          199 
          200         if hasattr(xml, "updated"):
          201                 feed["updated"] = parseiso(xml.updated, now) 
          202         elif hasattr(xml, "pubDate"):
          203                 feed["updated"] = parseiso(xml.pubDate, now)
          204         elif hasattr(xml, "lastBuildDate"):
          205                 feed["updated"] = parseiso(xml.lastBuildDate, now)
          206         else:
          207                 feed["updated"] = now
          208 
          209         if hasattr(xml, "link"):
          210                 if "href" in xml.link.attrib:
          211                         feed["link"] = str(xml.link.attrib["href"])
          212                 else:
          213                         feed["link"] = str(xml.link)
          214 
          215         if hasattr(xml, "webmaster"):
          216                 feed["email"] = html.unescape(str(xml.webmaster))
          217         elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
          218                 feed["email"] = html.unescape(str(xml.owner.email))
          219         elif hasattr(xml, "author") and hasattr(xml.author, "email"):
          220                 feed["email"] = html.unescape(str(xml.author.email))
          221         elif hasattr(xml, "webMaster"):
          222                 feed["email"] = html.unescape(str(xml.webMaster))
          223         elif hasattr(xml, "managingeditor"):
          224                 feed["email"] = html.unescape(str(xml.managingeditor))
          225         elif hasattr(xml, "managingEditor"):
          226                 feed["email"] = html.unescape(str(xml.managingEditor))
          227 
          228         if hasattr(xml, "author"):
          229                 if hasattr(xml.author, "name"):
          230                         feed["author"] = html.unescape(str(xml.author.name))
          231                 else:
          232                         feed["author"] = html.unescape(str(xml.author))
          233         elif hasattr(xml, "creator"):
          234                 feed["author"] = html.unescape(str(xml.creator))
          235 
          236         entryname = "entry"
          237         if isrss == True or isrdf == True:
          238                 entryname = "item"
          239         if isrdf == True:
          240                 xml = oxml
          241         if hasattr(xml, entryname):
          242                 for entry in xml[entryname][:]:
          243                         article = {}
          244                         # title
          245                         if hasattr(entry, "title"):
          246                                 article["title"] = html.unescape(\
          247                                                 str(entry["title"]))
          248 
          249                         # link
          250                         if hasattr(entry, "link"):
          251                                 if "href" in entry.link.attrib:
          252                                         article["link"] = str(entry.link.attrib["href"])
          253                                 else:
          254                                         article["link"] = str(entry.link)
          255                         elif hasattr(entry, "source"):
          256                                 article["link"] = str(entry.source)
          257 
          258                         # enclosure
          259                         if hasattr(entry, "enclosure"):
          260                                 if "href" in entry.enclosure.attrib:
          261                                         article["file"] = \
          262                                                 str(entry.enclosure.attrib["href"])
          263                                 elif "url" in entry.enclosure.attrib:
          264                                         article["file"] = \
          265                                                 str(entry.enclosure.attrib["url"])
          266                                 else:
          267                                         article["file"] = str(entry.enclosure)
          268 
          269                         if hasattr(entry, "group") and \
          270                                         hasattr(entry.group, "content"):
          271                                 if "url" in entry.group.content:
          272                                         article["file"] = \
          273                                                 html.unescape(\
          274                                                 str(entry.group.content.\
          275                                                 attrib["file"]))
          276 
          277                         # updated
          278                         try:
          279                                 if hasattr(entry, "updated"):
          280                                         article["updated"] = parseiso(entry.updated,\
          281                                                         now)
          282                                 elif hasattr(entry, "temporary"):
          283                                         article["updated"] = now
          284                                 elif hasattr(entry, "pubDate"):
          285                                         article["updated"] = parseiso(entry.pubDate,\
          286                                                         now)
          287                                 elif hasattr(entry, "date"):
          288                                         article["updated"] = parseiso(entry.date, now)
          289                                 else:
          290                                         article["updated"] = now
          291                         except TypeError:
          292                                 # There was some error in parseiso.
          293                                 article["updated"] = now
          294 
          295                         # author
          296                         if hasattr(entry, "author"):
          297                                 if hasattr(entry.author, "name"):
          298                                         article["author"] = html.unescape(\
          299                                                         str(entry.author.name))
          300                                 else:
          301                                         article["author"] = html.unescape(\
          302                                                         str(entry.author))
          303                         elif hasattr(entry, "creator"):
          304                                 article["author"] = html.unescape(\
          305                                                 str(entry.creator))
          306 
          307                         # tags
          308                         if hasattr(entry, "category"):
          309                                 article["tags"] = []
          310                                 for cat in entry["category"][:]:
          311                                         article["tags"].append(\
          312                                                         html.unescape(\
          313                                                         str(cat)))
          314 
          315                         # text
          316                         # Don't unescape the text, it might contain HTML.
          317                         if hasattr(entry, "encoded"):
          318                                 article["text"] = str(entry.encoded)
          319                         elif hasattr(entry, "content"):
          320                                 article["text"] = str(entry.content)
          321                         elif hasattr(entry, "summary"):
          322                                 article["text"] = str(entry.summary)
          323                         elif hasattr(entry, "description"):
          324                                 article["text"] = str(entry.description)
          325 
          326                         # id
          327                         if hasattr(entry, "id"):
          328                                 article["id"] = str(entry["id"])
          329                         else:
          330                                 if "link" in article:
          331                                         article["id"] = article["link"]
          332                                 elif "file" in article:
          333                                         article["id"] = article["file"]
          334                                 else:
          335                                         article["id"] = article["text"][:30]
          336 
          337                         if article["updated"] == now:
          338                                 article["uuid"] = ""
          339                         else:
          340                                 article["uuid"] = "%s" % (article["updated"])
          341 
          342                         # Certain websites need exceptions due to their
          343                         # »programmers« being stupid.
          344                         if "link" in feed:
          345                                 if "youtube.com" in feed["link"]:
          346                                         article["uuid"] = ""
          347 
          348                         for e in ("id", "title", "file"):
          349                                 if e in article:
          350                                         article["uuid"] = "%s-%s" % \
          351                                                 (article["uuid"],\
          352                                                  article[e])
          353 
          354                         def mkuuid(s):
          355                                 return hashlib.sha256(str(s).\
          356                                         encode("utf8")).hexdigest()
          357                         if len(article["uuid"]) == 0:
          358                                 article["uuid"] = mkuuid(now)
          359                         else:
          360                                 article["uuid"] = mkuuid(article["uuid"])
          361 
          362                         # sanity checks
          363                         if "title" not in article and "text" not in article \
          364                                         and "file" not in article:
          365                                 continue
          366 
          367                         articles.append(article)
          368 
          369         try:
          370                 feed["articles"] = sorted(articles, key=lambda article: \
          371                                 article["updated"])
          372         except TypeError:
          373                 for article in articles:
          374                         print(article["updated"])
          375 
          376         return feed
          377 
          378 def fetch(uri):
          379         ftype = "xml"
          380         if "file://" in uri:
          381                 fd = codecs.open(uri[7:], "r", "utf-8")
          382                 fval = fd.read().encode("utf-8")
          383                 fd.close()
          384                 rcode = 200
          385         elif "gopher://" in uri:
          386                 urls = urllib.parse.urlparse(uri, allow_fragments=False)
          387                 if ":" in urls.netloc:
          388                         (host, port) = urls.netloc.split(":")
          389                 else:
          390                         host = urls.netloc
          391                         port = 70
          392                 if len(urls.path) > 2:
          393                         if len(urls.query) > 0:
          394                                 selector = "%s?%s" % (urls.path[2:], urls.query)
          395                         else:
          396                                 selector = urls.path[2:]
          397                 else:
          398                         selector = ""
          399 
          400                 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
          401                 s.connect((host, port))
          402                 s.send(("%s\r\n" % (selector)).encode("utf-8"))
          403                 fd = s.makefile("r")
          404                 fval = fd.read().encode("utf-8")
          405                 s.close()
          406                 rcode = 200
          407         else:
          408                 fd = requests.get(uri, timeout=20,\
          409                         headers={"User-Agent": "Zeitungsschau/1.0"})
          410                 fval = fd.content
          411                 rcode = fd.status_code
          412 
          413                 if "Content-Type" in fd.headers:
          414                         if "application/json" in fd.headers["Content-Type"]:
          415                                 ftype = "json"
          416 
          417         if ftype == "xml":
          418                 suri = uri.lower().rsplit(".", 1)
          419                 if len(suri) > 1:
          420                         if suri[-1] == "json":
          421                                 ftype = "json"
          422                         elif suri[-1] == "txt":
          423                                 ftype = "twtxt"
          424 
          425         if ftype == "xml":
          426                 rval = (rcode, parseatomfeed(fval))
          427         elif ftype == "twtxt":
          428                 rval = (rcode, parsetwtxtfeed(fval.decode("utf-8"), uri))
          429         else:
          430                 rval = (rcode, parsejsonfeed(fval.decode("utf-8")))
          431         
          432         if rval[1] != None:
          433                 rval[1]["feeduri"] = uri
          434         
          435         return rval
          436