Normalize unicode - toot - Unnamed repository; edit this file 'description' to name the repository.
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) LICENSE
---
(DIR) commit 2ecc6a28c6b1cd2efd4bd94d801954e87ab1b320
(DIR) parent cb1f7b4e61e66ceecf91fe286ac9f44166ef3b25
(HTM) Author: Ivan Habunek <ivan@habunek.com>
Date: Sun, 21 Jan 2018 16:39:40 +0100
Normalize unicode
Diffstat:
toot/utils.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
---
(DIR) diff --git a/toot/utils.py b/toot/utils.py
@@ -2,6 +2,7 @@
import re
import socket
+import unicodedata
from bs4 import BeautifulSoup
@@ -10,7 +11,9 @@ from toot.exceptions import ConsoleError
def get_text(html):
"""Converts html to text, strips all tags."""
- return BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'")
+ text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'")
+
+ return unicodedata.normalize('NFKC', text)
def parse_html(html):