Now parse URLs following RFC 1738

This commit is contained in:
Némunaire 2012-11-06 04:14:11 +01:00
parent ca3559833a
commit aa697a36d0
2 changed files with 126 additions and 73 deletions

View file

@ -1,9 +1,8 @@
# coding=utf-8 # coding=utf-8
import http.client
import imp
import re import re
import sys
from tools import web
nemubotversion = 3.3 nemubotversion = 3.3
@ -40,9 +39,11 @@ def cmd_ycc(msg):
if len(msg.cmds) < 6: if len(msg.cmds) < 6:
res = list() res = list()
for url in msg.cmds[1:]: for url in msg.cmds[1:]:
srv = re.match(".*((ht|f)tps?://|www.)([^/ ]+).*", url) srv = web.getHost(url)
if srv is not None: if srv is not None:
res.append(gen_response(tinyfy(url), msg, srv.group(3))) res.append(gen_response(
web.getURLContent("http://ycc.fr/redirection/create/"
+ url).decode(), msg, srv))
else: else:
res.append(gen_response(False, msg, url)) res.append(gen_response(False, msg, url))
return res return res
@ -52,38 +53,19 @@ def cmd_ycc(msg):
def parselisten(msg):
    """Record the first URL with a host found in a message.

    Every ``scheme:...`` token of ``msg.content`` is tried in order; the
    first one for which ``web.getHost`` returns a host is appended to
    ``LAST_URLS[msg.channel]``.

    Returns True when a URL was recorded, False when no such URL was found
    or when its host is ycc.fr.
    """
    # The whole token is the URL. Capturing only the scheme (as a greedy
    # ".*" prefix plus a scheme group would) can never yield a host.
    for candidate in re.finditer(r"[a-zA-Z][a-zA-Z0-9+.-]*:(?://)?[^ ]+",
                                 msg.content):
        url = candidate.group(0)
        srv = web.getHost(url)
        if srv is None:
            # Looks like "word:word" but is not a URL with a host.
            continue
        if srv == "ycc.fr":
            return False
        LAST_URLS.setdefault(msg.channel, []).append(url)
        return True
    return False
def parseresponse(res):
    """Run parselisten() on a response; always return True."""
    parselisten(res)
    return True
def tinyfy(url):
    """Request a ycc.fr redirection for *url*.

    Returns the decoded response body when ycc.fr answers 200 with fewer
    than 100 bytes, otherwise prints an error and returns None.
    """
    result = getPage("ycc.fr", "/redirection/create/" + url)
    # getPage() returns None (not a tuple) when the host cannot be resolved,
    # so the result must be checked before unpacking.
    if result is not None:
        (status, page) = result
        if status == http.client.OK and len(page) < 100:
            return page.decode()
    print ("ERROR: ycc.fr seem down?")
    return None
def getPage(s, p):
    """GET path *p* from host *s* over HTTP with a 10 second timeout.

    Returns a ``(status, body)`` tuple, or None when the host name cannot
    be resolved.
    """
    # Imported here because the module-level imports visible in this file
    # do not include socket, which the except clause below needs.
    import socket

    conn = http.client.HTTPConnection(s, timeout=10)
    try:
        conn.request("GET", p)
        res = conn.getresponse()
        data = res.read()
    except socket.gaierror:
        print ("[%s] impossible de récupérer la page %s."%(s, p))
        return None
    finally:
        # Release the socket on the failure path as well.
        conn.close()
    return (res.status, data)

View file

@ -27,68 +27,98 @@ import xmlparser
# Default port of each scheme this module knows about.
_DEFAULT_PORTS = {"https": 443, "http": 80, "ftp": 21, "telnet": 23}

# Common Internet Scheme Syntax (RFC 1738, section 3.1), i.e. what follows
# "//": [user[:password]@]host[:port][/url-path]
_CISS_RE = re.compile(
    r"^(?:(?P<user>[^:@/?#]+)(?::(?P<password>[^@/?#]*))?@)?"
    r"(?P<host>[^:/?#@]+)"
    r"(?::(?P<port>[0-9]*))?"
    r"(?P<rest>[/?#].*)?$"
)


def parseURL(url):
    """Separate scheme, credentials, host, port and path of a URL.

    Returns a tuple ``(scheme, (user, password), host, port, path)``:

    * no ``scheme:`` prefix (or *url* is not a string): every item is None;
    * ``scheme:data`` without ``//``, or an authority part that cannot be
      parsed: ``(scheme, (None, None), None, None, data)``;
    * ``scheme://...``: all items filled in. *port* falls back to the
      scheme's default port (0 for unknown schemes) and *path* always
      starts with "/" ("/" alone when the URL has no path).
    """
    unparsed = (None, (None, None), None, None, None)
    if not isinstance(url, str):
        return unparsed

    res = re.match(r"^([a-zA-Z0-9+.-]+):(//)?(.*)$", url)
    if res is None:
        return unparsed

    scheme = res.group(1)
    if res.group(2) != "//":
        return (scheme, (None, None), None, None, res.group(3))

    ciss = _CISS_RE.match(res.group(3))
    if ciss is None:
        # e.g. empty host: report it like a URL without authority instead
        # of failing on a None match object.
        return (scheme, (None, None), None, None, res.group(3))

    # Scheme names are case-insensitive, so look the port up in lower case.
    port = _DEFAULT_PORTS.get(scheme.lower(), 0)
    explicit_port = ciss.group("port")
    if explicit_port and int(explicit_port) > 0:
        port = int(explicit_port)

    rest = ciss.group("rest") or ""
    # "http://host" and "http://host?q" have no explicit path; a request
    # still needs one that starts with "/".
    path = rest if rest.startswith("/") else "/" + rest

    return (scheme, (ciss.group("user"), ciss.group("password")),
            ciss.group("host"), port, path)
def isURL(url):
    """Return True if parseURL() finds a scheme in *url*."""
    return parseURL(url)[0] is not None
def getScheme(url):
    """Return the scheme (protocol) of *url*, or None if it has none."""
    return parseURL(url)[0]
def getHost(url):
    """Return the host component of *url* as reported by parseURL()."""
    components = parseURL(url)
    return components[2]
def getPort(url):
    """Return the port component of *url* as reported by parseURL()."""
    return parseURL(url)[3]
def getPath(url):
    """Return the path (page request) component of *url*."""
    return parseURL(url)[4]
def getUser(url):
    """Return the user name embedded in *url* as reported by parseURL()."""
    credentials = parseURL(url)[1]
    return credentials[0]
def getPassword(url):
    """Return the password embedded in *url* as reported by parseURL()."""
    credentials = parseURL(url)[1]
    return credentials[1]
# Get real pages # Get real pages
def getURLContent(url, timeout=15, max_redirects=5):
    """Return page content corresponding to URL or None if any error occurs.

    Only http and https URLs are fetched. 301, 302 and 307 responses are
    followed, at most *max_redirects* times; relative ``Location`` headers
    are resolved against *url*.

    Returns the raw body (bytes) for a 200 or 303 response, None otherwise.
    """
    import socket
    from urllib.parse import urljoin

    (scheme, (user, password), host, port, path) = parseURL(url)
    if port is None:
        return None

    # Pick the transport from the scheme: talking plain HTTP to an https
    # (or ftp, telnet, ...) endpoint can only fail.
    scheme = scheme.lower()
    if scheme == "https":
        connection_class = http.client.HTTPSConnection
    elif scheme == "http":
        connection_class = http.client.HTTPConnection
    else:
        return None

    conn = connection_class(host, port=port, timeout=timeout)
    try:
        conn.request("GET", path, None, {"User-agent": "Nemubot v3"})
        res = conn.getresponse()
        data = res.read()
    except socket.gaierror:
        print ("<tools.web> Unable to receive page %s from %s on %d."
               % (path, host, port))
        return None
    except (OSError, http.client.HTTPException):
        # Timeouts, refused connections, malformed responses, ...
        return None
    finally:
        # Always release the socket, including on the error paths.
        conn.close()

    # NOTE(review): 303 is returned as content rather than followed, as the
    # original code did -- confirm this is intended.
    if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
        return data

    if res.status in (http.client.FOUND, http.client.MOVED_PERMANENTLY,
                      http.client.TEMPORARY_REDIRECT):
        location = res.getheader("Location")
        if location is None or max_redirects <= 0:
            # Nothing to follow, or a redirection loop/chain too long.
            return None
        return getURLContent(urljoin(url, location), timeout,
                             max_redirects - 1)

    return None
@ -100,6 +130,46 @@ def getXML(url, timeout=15):
else: else:
return xmlparser.parse_string(cnt) return xmlparser.parse_string(cnt)
def traceURL(url, timeout=5, stack=None, max_redirects=20):
    """Follow redirections and return the redirections stack.

    Each visited URL is appended to *stack* (a new list when None). The
    trace ends when a URL answers 200, cannot be requested over HTTP(S),
    was already visited (it is appended once more to show the loop), or
    after *max_redirects* redirections.

    Returns the stack; "Timeout" is appended to it when a request times
    out. Returns None on name resolution failure, on any other connection
    or protocol error, on a redirection without ``Location`` header and on
    any status other than 200/301/302/303.
    """
    import socket
    from urllib.parse import urljoin

    if stack is None:
        stack = list()
    stack.append(url)

    for _ in range(max_redirects):
        (scheme, (user, password), host, port, path) = parseURL(url)
        if port is None or port == 0:
            return stack

        scheme = scheme.lower()
        if scheme == "https":
            connection_class = http.client.HTTPSConnection
        elif scheme == "http":
            connection_class = http.client.HTTPConnection
        else:
            # Not something a HEAD request can probe: the trace ends here.
            return stack

        conn = connection_class(host, port=port, timeout=timeout)
        try:
            conn.request("HEAD", path, None, {"User-agent": "Nemubot v3"})
            res = conn.getresponse()
        except socket.timeout:
            stack.append("Timeout")
            return stack
        except socket.gaierror:
            print ("<tools.web> Unable to receive page %s from %s on %d."
                   % (path, host, port))
            return None
        except (OSError, http.client.HTTPException):
            return None
        finally:
            # Always release the socket, including on the error paths.
            conn.close()

        if res.status == http.client.OK:
            return stack
        if res.status not in (http.client.FOUND,
                              http.client.MOVED_PERMANENTLY,
                              http.client.SEE_OTHER):
            return None

        location = res.getheader("Location")
        if location is None:
            return None
        # Location may be relative to the URL that was just requested.
        url = urljoin(url, location)
        if url in stack:
            # Redirection loop: record the repeated URL and stop.
            stack.append(url)
            return stack
        stack.append(url)

    return stack
# Other utils # Other utils
def striphtml(data): def striphtml(data):
@ -110,9 +180,10 @@ def striphtml(data):
# Tests when called alone: print how a few sample URLs are parsed.
if __name__ == "__main__":
    for sample in ("http://www.nemunai.re/",
                   "http://www.nemunai.re/?p0m",
                   "http://www.nemunai.re:42/?p0m",
                   "ftp://www.nemunai.re:42/?p0m",
                   "magnet:ceciestunmagnet!"):
        print(parseURL(sample))