Tools.web: use urllib instead of custom urlparser

2013-01-24 14:18:24 +01:00 · 2013-01-24 14:18:24 +01:00 · a11c617ade
parent 2fc9c89c9e
commit a11c617ade
2 changed files with 21 additions and 103 deletions
--- a/modules/chronos.py
+++ b/modules/chronos.py
@ -26,8 +26,10 @@ def get_courses(classe=None, room=None, teacher=None, date=None):
        url += "&teacher=" + quote(teacher)
    #TODO: date, not implemented at 23.tf

+    print_debug(url)
    response = web.getXML(url)
    if response is not None:
+        print_debug(response)
        return response.getNodes("course")
    else:
        return None
@ -55,6 +57,7 @@ def cmd_chronos(msg):
    res = Response(msg.sender, channel=msg.channel, nomore="Je n'ai pas d'autre cours à afficher")

    courses = get_courses(classe)
+    print_debug(courses)
    if courses is not None:
        now = datetime.now()
        tomorrow = now + timedelta(days=1)
--- a/tools/web.py
+++ b/tools/web.py
@ -21,91 +21,57 @@ import json
 import re
 import socket
 from urllib.parse import quote
+from urllib.parse import urlparse
+from urllib.request import urlopen

 import xmlparser

-# Parse URL
-
-def parseURL(url):
-    """Separate protocol, domain, port and page request"""
-    res = re.match("^([a-zA-Z0-9+.-]+):(//)?(.*)$", url)
-    if res is not None:
-        scheme = res.group(1)
-        if scheme == "https":
-            port = 443
-        elif scheme == "http":
-            port = 80
-        elif scheme == "ftp":
-            port = 21
-        elif scheme == "telnet":
-            port = 23
-        else:
-            port = 0
-        if res.group(2) == "//":
-            # Waiting Common Internet Scheme Syntax
-            ciss = re.match("^(([^:]+)(:([^@]+))@)?([^:/]+)(:([0-9]+))?(/.*)$", res.group(3))
-            user = ciss.group(2)
-            password = ciss.group(4)
-            host = ciss.group(5)
-            if ciss.group(7) is not None and ciss.group(7) != "" and int(ciss.group(7)) > 0:
-                port = int(ciss.group(7))
-            path = ciss.group(8)
-            return (scheme, (user, password), host, port, path)
-        else:
-            return (scheme, (None, None), None, None, res.group(3))
-    else:
-        return (None, (None, None), None, None, None)
-
 def isURL(url):
    """Return True if the URL can be parsed"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return scheme is not None
+    o = urlparse(url)
+    return o.scheme == "" and o.netloc == "" and o.path == ""

 def getScheme(url):
    """Return the protocol of a given URL"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return scheme
+    o = urlparse(url)
+    return o.scheme

 def getHost(url):
    """Return the domain of a given URL"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return host
+    return urlparse(url).netloc

 def getPort(url):
    """Return the port of a given URL"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return port
+    return urlparse(url).port

 def getPath(url):
    """Return the page request of a given URL"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return path
+    return urlparse(url).path

 def getUser(url):
    """Return the page request of a given URL"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return user
+    return urlparse(url).username
 def getPassword(url):
    """Return the page request of a given URL"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    return password
+    return urlparse(url).password


 # Get real pages

 def getURLContent(url, timeout=15):
    """Return page content corresponding to URL or None if any error occurs"""
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    if port is None:
-        return None
-    conn = http.client.HTTPConnection(host, port=port, timeout=timeout)
+    o = urlparse(url)
+    conn = http.client.HTTPConnection(o.netloc, port=o.port, timeout=timeout)
    try:
-        conn.request("GET", path, None, {"User-agent": "Nemubot v3"})
+        if o.query != '':
+            conn.request("GET", o.path + "?" + o.query, None, {"User-agent": "Nemubot v3"})
+        else:
+            conn.request("GET", o.path, None, {"User-agent": "Nemubot v3"})
    except socket.timeout:
        return None
    except socket.gaierror:
        print ("<tools.web> Unable to receive page %s from %s on %d."
-               % (path, host, port))
+               % (o.path, o.netloc, o.port))
        return None

    try:
@ -145,60 +111,9 @@ def getJSON(url, timeout=15):
    else:
        return json.loads(cnt.decode())

-def traceURL(url, timeout=5, stack=None):
-    """Follow redirections and return the redirections stack"""
-    if stack is None:
-        stack = list()
-    stack.append(url)
-
-    (scheme, (user, password), host, port, path) = parseURL(url)
-    if port is None or port == 0:
-        return stack
-    conn = http.client.HTTPConnection(host, port=port, timeout=timeout)
-    try:
-        conn.request("HEAD", path, None, {"User-agent": "Nemubot v3"})
-    except socket.timeout:
-        stack.append("Timeout")
-        return stack
-    except socket.gaierror:
-        print ("<tools.web> Unable to receive page %s from %s on %d."
-               % (path, host, port))
-        return None
-
-    try:
-        res = conn.getresponse()
-    except http.client.BadStatusLine:
-        return None
-    finally:
-        conn.close()
-
-    if res.status == http.client.OK:
-        return stack
-    elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY or res.status == http.client.SEE_OTHER:
-        url = res.getheader("Location")
-        if url in stack:
-            stack.append(url)
-            return stack
-        else:
-            return traceURL(url, timeout, stack)
-    else:
-        return None
-
-
 # Other utils

 def striphtml(data):
    """Remove HTML tags from text"""
    p = re.compile(r'<.*?>')
    return p.sub('', data).replace("&#x28;", "/(").replace("&#x29;", ")/").replace("&#x22;", "\"")
-
-
-# Tests when called alone
-if __name__ == "__main__":
-    print(parseURL("http://www.nemunai.re/"))
-    print(parseURL("http://www.nemunai.re/?p0m"))
-    print(parseURL("http://www.nemunai.re/?p0m"))
-    print(parseURL("http://www.nemunai.re:42/?p0m"))
-    print(parseURL("ftp://www.nemunai.re:42/?p0m"))
-    print(parseURL("http://www.nemunai.re/?p0m"))
-    print(parseURL("magnet:ceciestunmagnet!"))