def _replace_charref(match):
    """Translate one matched character reference into its character

    Argument:
    match -- a regex match whose group(1) is the reference body without
             the surrounding '&' and ';' (e.g. 'amp', '#233' or '#xE9')

    The original text is returned untouched when the reference cannot be
    resolved (unknown entity name or code point out of range).
    """

    from html.entities import name2codepoint

    ref = match.group(1)
    try:
        if ref[0] == '#':
            if ref[1] in 'xX':
                # Hexadecimal form: skip both '#' and the 'x' marker
                return chr(int(ref[2:], 16))
            # Decimal form: only the leading '#' has to be skipped
            return chr(int(ref[1:]))
        return chr(name2codepoint[ref])
    except (KeyError, ValueError, OverflowError):
        # Unknown name or invalid code point: keep the text as written
        return match.group(0)


def _unescape_fallback(s):
    """Decode HTML character references without html.unescape

    Argument:
    s -- the string to decode

    Only used on interpreters older than Python 3.4, where html.unescape
    does not exist.
    """

    if '&' not in s:
        return s

    import re

    # Match well-formed references only, so that a stray '&' followed later
    # by an unrelated ';' is not mistaken for an entity.
    return re.sub(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);',
                  _replace_charref, s)


def striphtml(data):
    """Remove HTML tags from text

    Argument:
    data -- the string to strip

    Tags are removed, character references are decoded, newlines become
    spaces and runs of spaces are collapsed into a single one.  Anything
    that is not a str (None included) is returned unchanged.
    """

    # 'buffer' does not exist in Python 3; testing against it raised a
    # NameError for every non-str input instead of returning it as is.
    if not isinstance(data, str):
        return data

    import re

    try:
        # unescape exists from Python 3.4
        from html import unescape
    except ImportError:
        unescape = _unescape_fallback

    text = unescape(re.sub(r'<.*?>', '', data)).replace('\n', ' ')
    return re.sub(r' +', ' ', text)