feed.py - zs - Zeitungsschau rss to email converter
git clone git://r-36.net/zs
---
feed.py (10854B)
---
#
# See LICENSE for licensing details.
#
# Copy me if you can.
# by 20h
#

import lxml
import lxml.objectify
import html
from datetime import datetime
import dateutil.parser
from dateutil.tz import gettz
import requests
import hashlib
import pytz
import codecs
import urllib.parse
import socket
import json

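# parseiso() turns a feed's date string into a timezone-aware datetime.
# Timezone names that gettz() cannot resolve map to None, and anything
# dateutil cannot parse falls back to the caller-supplied "now".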
def parseiso(dstr, now):
    def gettzinfo(zone, offset):
        try:
            return gettz(zone)
        except:
            return None

    try:
        return dateutil.parser.parse(str(dstr), default=now,
                tzinfos=gettzinfo)
    except:
        # Invalid time format. Could not be parsed.
        return now

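# Strip all XML namespaces in place so the feed parsers below can address
# elements as plain attributes (xml.channel, entry.title, ...), regardless
# of the prefixes a feed happens to use.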
def removenamespaces(xml):
    for key in xml.nsmap:
        nsstr = u'{%s}' % (xml.nsmap[key])
        nsl = len(nsstr)

        for elem in xml.iter():
            if elem.tag.startswith(nsstr):
                elem.tag = elem.tag[nsl:]

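# Decode, HTML-unescape and re-encode the raw feed bytes, then hand them to
# lxml.objectify and drop the namespaces right away.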
def parsexml(astr):
    xml = lxml.objectify.fromstring(html.unescape(astr.decode("utf-8")).encode("utf-8"))
    removenamespaces(xml)
    # Throw XML parsing errors so we can blame the feed authors.
    #print(lxml.objectify.dump(xml))
    return xml

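# twtxt feeds are plain text: one status per line, with an ISO timestamp and
# the text separated by a tab, e.g. (illustrative line only):
#
#   2020-12-24T20:20:20+01:00<TAB>Merry Christmas.
#
# Lines starting with "#" are comments and empty lines are skipped.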
def parsetwtxtfeed(astr, uri):
    feed = {}
    articles = []
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    feed["title"] = uri
    feed["link"] = uri
    feed["updated"] = now

    lines = astr.split("\n")
    for line in lines:
        # People already reinterpret the standard. :(
        if len(line) == 0:
            continue
        if line[0] == "#":
            continue

        createdtxt, ltext = line.split("\t", 1)
        created = parseiso(createdtxt, now)

        article = {}
        article["id"] = createdtxt
        article["title"] = ltext
        article["text"] = ltext
        article["uuid"] = createdtxt
        article["updated"] = created

        if article["updated"] == now:
            article["uuid"] = ""
        else:
            article["uuid"] = "%s" % (article["updated"])

        articles.append(article)

    feed["articles"] = articles

    return feed

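# JSON Feed (https://jsonfeed.org) input: map the top-level keys and every
# entry of "items" onto the same feed/article dictionaries the XML parser
# builds, so the rest of zs can treat all feed formats alike.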
def parsejsonfeed(astr):
    js = json.loads(astr)

    feed = {}
    articles = []
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    if "title" in js:
        feed["title"] = js["title"]
    if "description" in js:
        feed["description"] = js["description"]
    if "home_page_url" in js:
        feed["link"] = js["home_page_url"]
    if "feed_url" in js:
        feed["link"] = js["feed_url"]
    if "author" in js:
        if "name" in js["author"]:
            feed["author"] = js["author"]["name"]
    feed["updated"] = now

    if "items" in js:
        for item in js["items"]:
            article = {}
            if "url" in item:
                article["file"] = item["url"]
            if "title" in item:
                article["title"] = item["title"]

            if "summary" in item:
                article["text"] = html.unescape(item["summary"])
            if "content_html" in item:
                article["text"] = html.unescape(item["content_html"])
            if "content_text" in item:
                article["text"] = html.unescape(item["content_text"])

            # The id fallbacks rely on "text" and "file", so derive the
            # id only after those have been filled in.
            if "id" in item:
                article["id"] = item["id"]
            else:
                if "link" in article:
                    article["id"] = article["link"]
                elif "file" in article:
                    article["id"] = article["file"]
                else:
                    article["id"] = article["text"][:30]

            if "date_published" in item:
                article["updated"] = dateutil.parser.parse(item["date_published"])
            else:
                article["updated"] = now

            if article["updated"] == now:
                article["uuid"] = ""
            else:
                article["uuid"] = "%s" % (article["updated"])

            for e in ("id", "title", "file"):
                if e in article:
                    article["uuid"] = "%s-%s" % (article["uuid"], article[e])

            def mkuuid(s):
                return hashlib.sha256(
                    str(s).encode("utf8")).hexdigest()
            if len(article["uuid"]) == 0:
                article["uuid"] = mkuuid(now)
            else:
                article["uuid"] = mkuuid(article["uuid"])

            # sanity checks
            if "title" not in article and "text" not in article \
                    and "file" not in article:
                continue

            articles.append(article)

    feed["articles"] = articles

    return feed

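# Despite the name this parses Atom, RSS 2.0 and RDF/RSS 1.0 alike. RSS
# feeds are unwrapped to their <channel>; for RDF the document root is kept
# around in oxml because its <item> elements are siblings of <channel>
# rather than children.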
def parseatomfeed(astr):
    xml = parsexml(astr)
    if xml is None:
        return None

    feed = {}
    articles = []
    isrss = False
    isrdf = False
    now = datetime.now(pytz.utc)
    now = now.replace(hour=20, minute=20, second=20, microsecond=20)

    if hasattr(xml, "channel"):
        if hasattr(xml, "item"):
            isrdf = True
            oxml = xml
        xml = xml.channel
        isrss = True

    feed["title"] = ""
    for e in ("title", "description"):
        if hasattr(xml, e):
            feed[e] = html.unescape(str(xml[e]))

    if hasattr(xml, "image") and hasattr(xml.image, "title"):
        if feed["title"] == "":
            feed["title"] = html.unescape(str(xml.image.title))

    if hasattr(xml, "updated"):
        feed["updated"] = parseiso(xml.updated, now)
    elif hasattr(xml, "pubDate"):
        feed["updated"] = parseiso(xml.pubDate, now)
    elif hasattr(xml, "lastBuildDate"):
        feed["updated"] = parseiso(xml.lastBuildDate, now)
    else:
        feed["updated"] = now

    if hasattr(xml, "link"):
        if "href" in xml.link.attrib:
            feed["link"] = str(xml.link.attrib["href"])
        else:
            feed["link"] = str(xml.link)

    if hasattr(xml, "webmaster"):
        feed["email"] = html.unescape(str(xml.webmaster))
    elif hasattr(xml, "owner") and hasattr(xml.owner, "email"):
        feed["email"] = html.unescape(str(xml.owner.email))
    elif hasattr(xml, "author") and hasattr(xml.author, "email"):
        feed["email"] = html.unescape(str(xml.author.email))
    elif hasattr(xml, "webMaster"):
        feed["email"] = html.unescape(str(xml.webMaster))
    elif hasattr(xml, "managingeditor"):
        feed["email"] = html.unescape(str(xml.managingeditor))
    elif hasattr(xml, "managingEditor"):
        feed["email"] = html.unescape(str(xml.managingEditor))

    if hasattr(xml, "author"):
        if hasattr(xml.author, "name"):
            feed["author"] = html.unescape(str(xml.author.name))
        else:
            feed["author"] = html.unescape(str(xml.author))
    elif hasattr(xml, "creator"):
        feed["author"] = html.unescape(str(xml.creator))

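    # Atom calls its entries <entry>, RSS and RDF call them <item>; for RDF
    # the items are looked up on the saved document root (oxml).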
    entryname = "entry"
    if isrss or isrdf:
        entryname = "item"
    if isrdf:
        xml = oxml
    if hasattr(xml, entryname):
        for entry in xml[entryname][:]:
            article = {}
            # title
            if hasattr(entry, "title"):
                article["title"] = html.unescape(str(entry["title"]))

            # link
            if hasattr(entry, "link"):
                if "href" in entry.link.attrib:
                    article["link"] = str(entry.link.attrib["href"])
                else:
                    article["link"] = str(entry.link)
            elif hasattr(entry, "source"):
                article["link"] = str(entry.source)

            # enclosure
            if hasattr(entry, "enclosure"):
                if "href" in entry.enclosure.attrib:
                    article["file"] = str(entry.enclosure.attrib["href"])
                elif "url" in entry.enclosure.attrib:
                    article["file"] = str(entry.enclosure.attrib["url"])
                else:
                    article["file"] = str(entry.enclosure)

            if hasattr(entry, "group") and \
                    hasattr(entry.group, "content"):
                if "url" in entry.group.content.attrib:
                    article["file"] = html.unescape(
                        str(entry.group.content.attrib["url"]))

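            # Feeds disagree on date elements: try Atom <updated>, then RSS
            # <pubDate>, then <date> (e.g. Dublin Core dc:date), and fall
            # back to the synthetic "now" so every article has a timestamp.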
            # updated
            try:
                if hasattr(entry, "updated"):
                    article["updated"] = parseiso(entry.updated, now)
                elif hasattr(entry, "temporary"):
                    article["updated"] = now
                elif hasattr(entry, "pubDate"):
                    article["updated"] = parseiso(entry.pubDate, now)
                elif hasattr(entry, "date"):
                    article["updated"] = parseiso(entry.date, now)
                else:
                    article["updated"] = now
            except TypeError:
                # There was some error in parseiso.
                article["updated"] = now

            # author
            if hasattr(entry, "author"):
                if hasattr(entry.author, "name"):
                    article["author"] = html.unescape(
                        str(entry.author.name))
                else:
                    article["author"] = html.unescape(str(entry.author))
            elif hasattr(entry, "creator"):
                article["author"] = html.unescape(str(entry.creator))

            # tags
            if hasattr(entry, "category"):
                article["tags"] = []
                for cat in entry["category"][:]:
                    article["tags"].append(html.unescape(str(cat)))

            # text
            # Don't unescape the text, it might contain HTML.
            if hasattr(entry, "encoded"):
                article["text"] = str(entry.encoded)
            elif hasattr(entry, "content"):
                article["text"] = str(entry.content)
            elif hasattr(entry, "summary"):
                article["text"] = str(entry.summary)
            elif hasattr(entry, "description"):
                article["text"] = str(entry.description)

            # id
            if hasattr(entry, "id"):
                article["id"] = str(entry["id"])
            else:
                if "link" in article:
                    article["id"] = article["link"]
                elif "file" in article:
                    article["id"] = article["file"]
                else:
                    article["id"] = article["text"][:30]

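            # The uuid decides whether an article is considered new: start
            # from the article timestamp (dropped when the feed supplied
            # none, and always for youtube.com links), append id, title and
            # file, and hash the result with SHA-256.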
            if article["updated"] == now:
                article["uuid"] = ""
            else:
                article["uuid"] = "%s" % (article["updated"])

            # Certain websites need exceptions due to their
            # »programmers« being stupid.
            if "link" in feed:
                if "youtube.com" in feed["link"]:
                    article["uuid"] = ""

            for e in ("id", "title", "file"):
                if e in article:
                    article["uuid"] = "%s-%s" % (article["uuid"], article[e])

            def mkuuid(s):
                return hashlib.sha256(
                    str(s).encode("utf8")).hexdigest()
            if len(article["uuid"]) == 0:
                article["uuid"] = mkuuid(now)
            else:
                article["uuid"] = mkuuid(article["uuid"])

            # sanity checks
            if "title" not in article and "text" not in article \
                    and "file" not in article:
                continue

            articles.append(article)

    try:
        feed["articles"] = sorted(articles,
            key=lambda article: article["updated"])
    except TypeError:
        for article in articles:
            print(article["updated"])

    return feed

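# fetch() retrieves a feed and returns an (HTTP status, feed dict) tuple.
# file:// and gopher:// URIs are read directly and always report 200;
# everything else goes through requests. The parser is picked from the
# Content-Type header and the URI suffix: ".json" means JSON Feed, ".txt"
# means twtxt, anything else is treated as Atom/RSS.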
def fetch(uri):
    ftype = "xml"
    if "file://" in uri:
        fd = codecs.open(uri[7:], "r", "utf-8")
        fval = fd.read().encode("utf-8")
        fd.close()
        rcode = 200
    elif "gopher://" in uri:
        urls = urllib.parse.urlparse(uri, allow_fragments=False)
        if ":" in urls.netloc:
            (host, port) = urls.netloc.split(":")
            port = int(port)
        else:
            host = urls.netloc
            port = 70
        if len(urls.path) > 2:
            if len(urls.query) > 0:
                selector = "%s?%s" % (urls.path[2:], urls.query)
            else:
                selector = urls.path[2:]
        else:
            selector = ""

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.connect((host, port))
        s.send(("%s\r\n" % (selector)).encode("utf-8"))
        fd = s.makefile("r")
        fval = fd.read().encode("utf-8")
        s.close()
        rcode = 200
    else:
        fd = requests.get(uri, timeout=20,
            headers={"User-Agent": "Zeitungsschau/1.0"})
        fval = fd.content
        rcode = fd.status_code

        if "Content-Type" in fd.headers:
            if "application/json" in fd.headers["Content-Type"]:
                ftype = "json"

    if ftype == "xml":
        suri = uri.lower().rsplit(".", 1)
        if len(suri) > 1:
            if suri[-1] == "json":
                ftype = "json"
            elif suri[-1] == "txt":
                ftype = "twtxt"

    if ftype == "xml":
        rval = (rcode, parseatomfeed(fval))
    elif ftype == "twtxt":
        rval = (rcode, parsetwtxtfeed(fval.decode("utf-8"), uri))
    else:
        rval = (rcode, parsejsonfeed(fval.decode("utf-8")))

    if rval[1] is not None:
        rval[1]["feeduri"] = uri

    return rval

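# Example usage (a sketch, not part of the original module; the URL is made
# up):
#
#   rcode, feed = fetch("https://example.org/feed.xml")
#   if rcode == 200 and feed is not None:
#       for article in feed["articles"]:
#           print(article["updated"], article.get("title", ""))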