From 04dcf07fb24fd0fa3f1ad6b87d504f43a8424368 Mon Sep 17 00:00:00 2001
From: nemunaire <nemunaire@nemunai.re>
Date: Sat, 10 Oct 2015 15:55:51 +0200
Subject: [PATCH] tools/web: use standard unescape instead of custom function
 when available

---
 nemubot/tools/web.py | 50 +++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/nemubot/tools/web.py b/nemubot/tools/web.py
index b1d146a..35e9e7b 100644
--- a/nemubot/tools/web.py
+++ b/nemubot/tools/web.py
@@ -192,20 +192,6 @@ def getJSON(url, timeout=15):
 
 # Other utils
 
-def htmlentitydecode(s):
-    """Decode htmlentities
-
-    Argument:
-    s -- The string to decode
-    """
-
-    import re
-    from html.entities import name2codepoint
-
-    return re.sub('&(%s);' % '|'.join(name2codepoint),
-                  lambda m: chr(name2codepoint[m.group(1)]), s)
-
-
 def striphtml(data):
     """Remove HTML tags from text
 
@@ -213,15 +199,35 @@ def striphtml(data):
     data -- the string to strip
     """
 
-    if data is None or (not isinstance(data, str) and not isinstance(data, buffer)):
+    if not isinstance(data, str) and not isinstance(data, buffer):
         return data
 
+    try:
+        from html import unescape
+    except ImportError:
+        def _replace_charref(s):  # re.sub callback: match object -> decoded char
+            s = s.group(1)
+            # s is the reference body between '&' and ';': '#x28', '#39', 'amp'...
+            if s[0] == '#':
+                if s[1] in 'xX':
+                    return chr(int(s[2:], 16))  # hexadecimal: skip the '#x' prefix
+                else:
+                    return chr(int(s[1:]))  # decimal: skip only '#', keep every digit
+            else:
+                from html.entities import name2codepoint
+                return chr(name2codepoint[s])  # named entity; unknown name -> KeyError
+
+        # html.unescape exists from Python 3.4; fallback used when its import fails
+        def unescape(s):
+            if '&' not in s:
+                return s
+            # Past this point s contains at least one '&', so a scan is needed.
+            import re
+            # Each '&...;' span is handed to _replace_charref to be decoded.
+            return re.sub('&([^;]+);', _replace_charref, s)
+
+
     import re
-    p = re.compile(r'<.*?>')
-    r, _ = re.subn(r' +', ' ', htmlentitydecode(p.sub('', data)
-                                                .replace("&#x28;", "/(")
-                                                .replace("&#x29;", ")/")
-                                                .replace("&#39;", "´")
-                                                .replace("&#x22;", "\""))
-                   .replace('\n', ' '))
+    r, _ = re.subn(r' +', ' ',
+                   unescape(re.sub(r'<.*?>', '', data)).replace('\n', ' '))
     return r