Parse subjects for HTML content and clean it. - zs - Zeitungsschau rss to email converter
 (HTM) git clone git://r-36.net/zs
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit cc1e0defe58a83c1d59a31d72b7e8e7dec726883
 (DIR) parent ee341c7915c2d85c90eb6deef11c964fb88986fa
 (HTM) Author: Christoph Lohmann <20h@r-36.net>
       Date:   Wed, 24 Jan 2018 14:07:16 +0100
       
       Parse subjects for HTML content and clean it.
       
       Some RSS feeds have escaped HTML in escaped HTML in subjects. We need to
       clean this mess up manually on our side.
       
       Diffstat:
         zeitungsschau/feedemail.py          |       7 +++++--
       
       1 file changed, 5 insertions(+), 2 deletions(-)
       ---
 (DIR) diff --git a/zeitungsschau/feedemail.py b/zeitungsschau/feedemail.py
       @@ -12,11 +12,14 @@ from email.utils import formataddr, formatdate, parseaddr
        from email.header import Header
        import time
        import subprocess
       +import lxml.html
        
        import html2text
        
        def normalizeheader(hstr):
       -        return hstr.replace("\n", " ").strip()
       +        return lxml.html.fromstring(hstr).text_content().\
       +                        replace(u"\xa0", "").\
       +                        replace("\n", " ").strip()
        
        class LocalSendmail(object):
                cmd="/usr/sbin/sendmail -f \"%s\" \"%s\""
       @@ -58,7 +61,7 @@ def send(feed, to, smtphost="localhost", smtpport=None, ssl="False",\
                                                normalizeheader(article["title"]),\
                                                "utf-8")
                        else:
       -                        subject = Header(normalizeheader(text[:70]),\
       +                        subject = Header(normalizeheader(text[:20]),\
                                                "utf-8")
        
                        # Append metadata.