From aa697a36d08d6922b61d572ff2b03d930aec6ec7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?N=C3=A9munaire?=
Date: Tue, 6 Nov 2012 04:14:11 +0100
Subject: [PATCH] Now parse URL following RFC 1738

---
 modules/ycc.py |  52 ++++++-----------
 tools/web.py   | 147 ++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 126 insertions(+), 73 deletions(-)

diff --git a/modules/ycc.py b/modules/ycc.py
index 4c41d71..792775d 100644
--- a/modules/ycc.py
+++ b/modules/ycc.py
@@ -1,9 +1,8 @@
 # coding=utf-8
 
-import http.client
-import imp
 import re
-import sys
+
+from tools import web
 
 nemubotversion = 3.3
 
@@ -40,9 +39,11 @@ def cmd_ycc(msg):
     if len(msg.cmds) < 6:
         res = list()
         for url in msg.cmds[1:]:
-            srv = re.match(".*((ht|f)tps?://|www.)([^/ ]+).*", url)
+            srv = web.getHost(url)
             if srv is not None:
-                res.append(gen_response(tinyfy(url), msg, srv.group(3)))
+                res.append(gen_response(
+                    web.getURLContent("http://ycc.fr/redirection/create/"
+                                      + url).decode(), msg, srv))
             else:
                 res.append(gen_response(False, msg, url))
         return res
@@ -52,38 +53,19 @@ def cmd_ycc(msg):
 
 def parselisten(msg):
     global LAST_URLS
-    res = re.match(".*(((ht|f)tps?://|www\.)[^ ]+).*", msg.content)
+    res = re.match(".*([a-zA-Z0-9+.-]+):(//)?([^ ]*).*", msg.content)
     if res is not None:
-        if res.group(1).find("ycc.fr") >= 0:
-            return False
-        if msg.channel not in LAST_URLS:
-            LAST_URLS[msg.channel] = list()
-        LAST_URLS[msg.channel].append(res.group(1))
-        return True
+        url = res.group(1)
+        srv = web.getHost(url)
+        if srv is not None:
+            if srv == "ycc.fr":
+                return False
+            if msg.channel not in LAST_URLS:
+                LAST_URLS[msg.channel] = list()
+            LAST_URLS[msg.channel].append(url)
+            return True
     return False
 
+
 def parseresponse(res):
     parselisten(res)
     return True
-
-def tinyfy(url):
-    (status, page) = getPage("ycc.fr", "/redirection/create/" + url)
-    if status == http.client.OK and len(page) < 100:
-        return page.decode()
-    else:
-        print ("ERROR: ycc.fr seem down?")
-        return None
-
-
-def getPage(s, p):
-    conn = http.client.HTTPConnection(s, timeout=10)
-    try:
-        conn.request("GET", p)
-    except socket.gaierror:
-        print ("[%s] impossible de récupérer la page %s."%(s, p))
-        return None
-
-    res = conn.getresponse()
-    data = res.read()
-
-    conn.close()
-    return (res.status, data)
diff --git a/tools/web.py b/tools/web.py
index bec4bf8..8751d2f 100644
--- a/tools/web.py
+++ b/tools/web.py
@@ -27,68 +27,98 @@ import xmlparser
 
 def parseURL(url):
     """Separate protocol, domain, port and page request"""
-    res = re.match("^(([^:]+)://)?([^:/]+)(:([0-9]{1,5}))?(.*)$", url)
+    res = re.match("^([a-zA-Z0-9+.-]+):(//)?(.*)$", url)
     if res is not None:
-        if res.group(5) is not None:
-            port = int(res.group(5))
-        elif res.group(2) is not None:
-            if res.group(2) == "http":
-                port = 80
-            elif res.group(2) == "https":
-                port = 443
-            else:
-                print (" WARNING: unknown protocol %s"
-                       % res.group(2))
-                port = 0
+        scheme = res.group(1)
+        if scheme == "https":
+            port = 443
+        elif scheme == "http":
+            port = 80
+        elif scheme == "ftp":
+            port = 21
+        elif scheme == "telnet":
+            port = 23
         else:
             port = 0
-        return (res.group(2), res.group(3), port, res.group(6))
+        if res.group(2) == "//":
+            # Expecting the Common Internet Scheme Syntax
+            ciss = re.match("^(([^:]+)(:([^@]+))@)?([^:/]+)(:([0-9]+))?(/.*)$", res.group(3))
+            user = ciss.group(2)
+            password = ciss.group(4)
+            host = ciss.group(5)
+            if ciss.group(7) is not None and ciss.group(7) != "" and int(ciss.group(7)) > 0:
+                port = int(ciss.group(7))
+            path = ciss.group(8)
+            return (scheme, (user, password), host, port, path)
+        else:
+            return (scheme, (None, None), None, None, res.group(3))
     else:
-        return (None, None, None, None)
+        return (None, (None, None), None, None, None)
 
-def getDomain(url):
-    """Return the domain of a given URL"""
-    (protocol, domain, port, page) = parseURL(url)
-    return domain
+def isURL(url):
+    """Return True if the URL can be parsed"""
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    return scheme is not None
 
-def getProtocol(url):
+def getScheme(url):
     """Return the protocol of a given URL"""
-    (protocol, domain, port, page) = parseURL(url)
-    return protocol
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    return scheme
+
+def getHost(url):
+    """Return the domain of a given URL"""
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    return host
 
 def getPort(url):
     """Return the port of a given URL"""
-    (protocol, domain, port, page) = parseURL(url)
+    (scheme, (user, password), host, port, path) = parseURL(url)
     return port
 
-def getRequest(url):
+def getPath(url):
     """Return the page request of a given URL"""
-    (protocol, domain, port, page) = parseURL(url)
-    return page
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    return path
+
+def getUser(url):
+    """Return the username of a given URL"""
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    return user
 
+def getPassword(url):
+    """Return the password of a given URL"""
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    return password
 
 # Get real pages
 
 def getURLContent(url, timeout=15):
     """Return page content corresponding to URL or None if any error occurs"""
-    (protocol, domain, port, page) = parseURL(url)
-    if port == 0: port = 80
-    conn = http.client.HTTPConnection(domain, port=port, timeout=15)
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    if port is None:
+        return None
+    conn = http.client.HTTPConnection(host, port=port, timeout=timeout)
     try:
-        conn.request("GET", page, None, {"User-agent": "Nemubot v3"})
+        conn.request("GET", path, None, {"User-agent": "Nemubot v3"})
+    except socket.timeout:
+        return None
     except socket.gaierror:
         print (" Unable to receive page %s from %s on %d."
-               % (page, domain, port))
+               % (path, host, port))
         return None
 
-    res = conn.getresponse()
-    data = res.read()
-
-    conn.close()
+    try:
+        res = conn.getresponse()
+        data = res.read()
+    except http.client.BadStatusLine:
+        return None
+    finally:
+        conn.close()
 
     if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
         return data
-    #TODO: follow redirections
+    elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY:
+        return getURLContent(res.getheader("Location"), timeout)
     else:
         return None
 
@@ -100,6 +130,46 @@ def getXML(url, timeout=15):
     else:
         return xmlparser.parse_string(cnt)
 
+def traceURL(url, timeout=5, stack=None):
+    """Follow redirections and return the redirections stack"""
+    if stack is None:
+        stack = list()
+    stack.append(url)
+
+    (scheme, (user, password), host, port, path) = parseURL(url)
+    if port is None or port == 0:
+        return stack
+    conn = http.client.HTTPConnection(host, port=port, timeout=timeout)
+    try:
+        conn.request("HEAD", path, None, {"User-agent": "Nemubot v3"})
+    except socket.timeout:
+        stack.append("Timeout")
+        return stack
+    except socket.gaierror:
+        print (" Unable to receive page %s from %s on %d."
+               % (path, host, port))
+        return None
+
+    try:
+        res = conn.getresponse()
+    except http.client.BadStatusLine:
+        return None
+    finally:
+        conn.close()
+
+    if res.status == http.client.OK:
+        return stack
+    elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY or res.status == http.client.SEE_OTHER:
+        url = res.getheader("Location")
+        if url in stack:
+            stack.append(url)
+            return stack
+        else:
+            return traceURL(url, timeout, stack)
+    else:
+        return None
+
+
 # Other utils
 
 def striphtml(data):
@@ -110,9 +180,10 @@
 
 # Tests when called alone
 if __name__ == "__main__":
-    print(parseURL("www.nemunai.re"))
-    print(parseURL("www.nemunai.re/?p0m"))
+    print(parseURL("http://www.nemunai.re/"))
+    print(parseURL("http://www.nemunai.re/?p0m"))
     print(parseURL("http://www.nemunai.re/?p0m"))
     print(parseURL("http://www.nemunai.re:42/?p0m"))
-    print(parseURL("www.nemunai.re:42/?p0m"))
+    print(parseURL("ftp://www.nemunai.re:42/?p0m"))
     print(parseURL("http://www.nemunai.re/?p0m"))
+    print(parseURL("magnet:ceciestunmagnet!"))
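
Below, for reference, is a minimal usage sketch of the reworked tools.web helpers introduced by this patch. It is not part of the diff; it assumes tools/web.py is importable as tools.web, and the example URLs (including the user:secret credentials) are only illustrative.

from tools import web

# parseURL() now returns a 5-tuple following RFC 1738:
# (scheme, (user, password), host, port, path)
print(web.parseURL("http://user:secret@www.nemunai.re:42/?p0m"))
# -> ('http', ('user', 'secret'), 'www.nemunai.re', 42, '/?p0m')

# URLs that do not use the Common Internet Scheme Syntax keep
# everything after the colon as the path
print(web.parseURL("magnet:ceciestunmagnet!"))
# -> ('magnet', (None, None), None, None, 'ceciestunmagnet!')

# Convenience accessors built on top of parseURL()
print(web.getHost("https://www.nemunai.re/"))  # 'www.nemunai.re'
print(web.getPort("https://www.nemunai.re/"))  # 443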