Now parse URL following RFC 1738
This commit is contained in:
parent
ca3559833a
commit
aa697a36d0
2 changed files with 126 additions and 73 deletions
|
@ -1,9 +1,8 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
import http.client
|
|
||||||
import imp
|
|
||||||
import re
|
import re
|
||||||
import sys
|
|
||||||
|
from tools import web
|
||||||
|
|
||||||
nemubotversion = 3.3
|
nemubotversion = 3.3
|
||||||
|
|
||||||
|
@ -40,9 +39,11 @@ def cmd_ycc(msg):
|
||||||
if len(msg.cmds) < 6:
|
if len(msg.cmds) < 6:
|
||||||
res = list()
|
res = list()
|
||||||
for url in msg.cmds[1:]:
|
for url in msg.cmds[1:]:
|
||||||
srv = re.match(".*((ht|f)tps?://|www.)([^/ ]+).*", url)
|
srv = web.getHost(url)
|
||||||
if srv is not None:
|
if srv is not None:
|
||||||
res.append(gen_response(tinyfy(url), msg, srv.group(3)))
|
res.append(gen_response(
|
||||||
|
web.getURLContent("http://ycc.fr/redirection/create/"
|
||||||
|
+ url).decode(), msg, srv))
|
||||||
else:
|
else:
|
||||||
res.append(gen_response(False, msg, url))
|
res.append(gen_response(False, msg, url))
|
||||||
return res
|
return res
|
||||||
|
@ -52,38 +53,19 @@ def cmd_ycc(msg):
|
||||||
|
|
||||||
def parselisten(msg):
|
def parselisten(msg):
|
||||||
global LAST_URLS
|
global LAST_URLS
|
||||||
res = re.match(".*(((ht|f)tps?://|www\.)[^ ]+).*", msg.content)
|
res = re.match(".*([a-zA-Z0-9+.-]+):(//)?([^ ]*).*", msg.content)
|
||||||
if res is not None:
|
if res is not None:
|
||||||
if res.group(1).find("ycc.fr") >= 0:
|
url = res.group(1)
|
||||||
|
srv = web.getHost(url)
|
||||||
|
if srv is not None:
|
||||||
|
if srv == "ycc.fr":
|
||||||
return False
|
return False
|
||||||
if msg.channel not in LAST_URLS:
|
if msg.channel not in LAST_URLS:
|
||||||
LAST_URLS[msg.channel] = list()
|
LAST_URLS[msg.channel] = list()
|
||||||
LAST_URLS[msg.channel].append(res.group(1))
|
LAST_URLS[msg.channel].append(url)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def parseresponse(res):
|
def parseresponse(res):
|
||||||
parselisten(res)
|
parselisten(res)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def tinyfy(url):
|
|
||||||
(status, page) = getPage("ycc.fr", "/redirection/create/" + url)
|
|
||||||
if status == http.client.OK and len(page) < 100:
|
|
||||||
return page.decode()
|
|
||||||
else:
|
|
||||||
print ("ERROR: ycc.fr seem down?")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def getPage(s, p):
|
|
||||||
conn = http.client.HTTPConnection(s, timeout=10)
|
|
||||||
try:
|
|
||||||
conn.request("GET", p)
|
|
||||||
except socket.gaierror:
|
|
||||||
print ("[%s] impossible de récupérer la page %s."%(s, p))
|
|
||||||
return None
|
|
||||||
|
|
||||||
res = conn.getresponse()
|
|
||||||
data = res.read()
|
|
||||||
|
|
||||||
conn.close()
|
|
||||||
return (res.status, data)
|
|
||||||
|
|
139
tools/web.py
139
tools/web.py
|
@ -27,68 +27,98 @@ import xmlparser
|
||||||
|
|
||||||
def parseURL(url):
|
def parseURL(url):
|
||||||
"""Separate protocol, domain, port and page request"""
|
"""Separate protocol, domain, port and page request"""
|
||||||
res = re.match("^(([^:]+)://)?([^:/]+)(:([0-9]{1,5}))?(.*)$", url)
|
res = re.match("^([a-zA-Z0-9+.-]+):(//)?(.*)$", url)
|
||||||
if res is not None:
|
if res is not None:
|
||||||
if res.group(5) is not None:
|
scheme = res.group(1)
|
||||||
port = int(res.group(5))
|
if scheme == "https":
|
||||||
elif res.group(2) is not None:
|
|
||||||
if res.group(2) == "http":
|
|
||||||
port = 80
|
|
||||||
elif res.group(2) == "https":
|
|
||||||
port = 443
|
port = 443
|
||||||
else:
|
elif scheme == "http":
|
||||||
print ("<tools.web> WARNING: unknown protocol %s"
|
port = 80
|
||||||
% res.group(2))
|
elif scheme == "ftp":
|
||||||
port = 0
|
port = 21
|
||||||
|
elif scheme == "telnet":
|
||||||
|
port = 23
|
||||||
else:
|
else:
|
||||||
port = 0
|
port = 0
|
||||||
return (res.group(2), res.group(3), port, res.group(6))
|
if res.group(2) == "//":
|
||||||
|
# Expecting the Common Internet Scheme Syntax (RFC 1738, section 3.1): //user:password@host:port/url-path
|
||||||
|
ciss = re.match("^(([^:]+)(:([^@]+))@)?([^:/]+)(:([0-9]+))?(/.*)$", res.group(3))
|
||||||
|
user = ciss.group(2)
|
||||||
|
password = ciss.group(4)
|
||||||
|
host = ciss.group(5)
|
||||||
|
if ciss.group(7) is not None and ciss.group(7) != "" and int(ciss.group(7)) > 0:
|
||||||
|
port = int(ciss.group(7))
|
||||||
|
path = ciss.group(8)
|
||||||
|
return (scheme, (user, password), host, port, path)
|
||||||
else:
|
else:
|
||||||
return (None, None, None, None)
|
return (scheme, (None, None), None, None, res.group(3))
|
||||||
|
else:
|
||||||
|
return (None, (None, None), None, None, None)
|
||||||
|
|
||||||
def getDomain(url):
|
def isURL(url):
|
||||||
"""Return the domain of a given URL"""
|
"""Return True if the URL can be parsed"""
|
||||||
(protocol, domain, port, page) = parseURL(url)
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
return domain
|
return scheme is not None
|
||||||
|
|
||||||
def getProtocol(url):
|
def getScheme(url):
|
||||||
"""Return the protocol of a given URL"""
|
"""Return the scheme of a given URL"""
|
||||||
(protocol, domain, port, page) = parseURL(url)
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
return protocol
|
return scheme
|
||||||
|
|
||||||
|
def getHost(url):
|
||||||
|
"""Return the host of a given URL"""
|
||||||
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
|
return host
|
||||||
|
|
||||||
def getPort(url):
|
def getPort(url):
|
||||||
"""Return the port of a given URL"""
|
"""Return the port of a given URL"""
|
||||||
(protocol, domain, port, page) = parseURL(url)
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
return port
|
return port
|
||||||
|
|
||||||
def getRequest(url):
|
def getPath(url):
|
||||||
"""Return the page request of a given URL"""
|
"""Return the path of a given URL"""
|
||||||
(protocol, domain, port, page) = parseURL(url)
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
return page
|
return path
|
||||||
|
|
||||||
|
def getUser(url):
|
||||||
|
"""Return the user name of a given URL"""
|
||||||
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
|
return user
|
||||||
|
def getPassword(url):
|
||||||
|
"""Return the password of a given URL"""
|
||||||
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
|
return password
|
||||||
|
|
||||||
|
|
||||||
# Get real pages
|
# Get real pages
|
||||||
|
|
||||||
def getURLContent(url, timeout=15):
|
def getURLContent(url, timeout=15):
|
||||||
"""Return page content corresponding to URL or None if any error occurs"""
|
"""Return page content corresponding to URL or None if any error occurs"""
|
||||||
(protocol, domain, port, page) = parseURL(url)
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
if port == 0: port = 80
|
if port is None:
|
||||||
conn = http.client.HTTPConnection(domain, port=port, timeout=15)
|
return None
|
||||||
|
conn = http.client.HTTPConnection(host, port=port, timeout=timeout)
|
||||||
try:
|
try:
|
||||||
conn.request("GET", page, None, {"User-agent": "Nemubot v3"})
|
conn.request("GET", path, None, {"User-agent": "Nemubot v3"})
|
||||||
|
except socket.timeout:
|
||||||
|
return None
|
||||||
except socket.gaierror:
|
except socket.gaierror:
|
||||||
print ("<tools.web> Unable to receive page %s from %s on %d."
|
print ("<tools.web> Unable to receive page %s from %s on %d."
|
||||||
% (page, domain, port))
|
% (path, host, port))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
res = conn.getresponse()
|
res = conn.getresponse()
|
||||||
data = res.read()
|
data = res.read()
|
||||||
|
except http.client.BadStatusLine:
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
|
if res.status == http.client.OK or res.status == http.client.SEE_OTHER:
|
||||||
return data
|
return data
|
||||||
#TODO: follow redirections
|
elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY:
|
||||||
|
return getURLContent(res.getheader("Location"), timeout)
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -100,6 +130,46 @@ def getXML(url, timeout=15):
|
||||||
else:
|
else:
|
||||||
return xmlparser.parse_string(cnt)
|
return xmlparser.parse_string(cnt)
|
||||||
|
|
||||||
|
def traceURL(url, timeout=5, stack=None):
|
||||||
|
"""Follow redirections and return the redirections stack"""
|
||||||
|
if stack is None:
|
||||||
|
stack = list()
|
||||||
|
stack.append(url)
|
||||||
|
|
||||||
|
(scheme, (user, password), host, port, path) = parseURL(url)
|
||||||
|
if port is None or port == 0:
|
||||||
|
return stack
|
||||||
|
conn = http.client.HTTPConnection(host, port=port, timeout=timeout)
|
||||||
|
try:
|
||||||
|
conn.request("HEAD", path, None, {"User-agent": "Nemubot v3"})
|
||||||
|
except socket.timeout:
|
||||||
|
stack.append("Timeout")
|
||||||
|
return stack
|
||||||
|
except socket.gaierror:
|
||||||
|
print ("<tools.web> Unable to receive page %s from %s on %d."
|
||||||
|
% (path, host, port))
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
res = conn.getresponse()
|
||||||
|
except http.client.BadStatusLine:
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if res.status == http.client.OK:
|
||||||
|
return stack
|
||||||
|
elif res.status == http.client.FOUND or res.status == http.client.MOVED_PERMANENTLY or res.status == http.client.SEE_OTHER:
|
||||||
|
url = res.getheader("Location")
|
||||||
|
if url in stack:
|
||||||
|
stack.append(url)
|
||||||
|
return stack
|
||||||
|
else:
|
||||||
|
return traceURL(url, timeout, stack)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
# Other utils
|
# Other utils
|
||||||
|
|
||||||
def striphtml(data):
|
def striphtml(data):
|
||||||
|
@ -110,9 +180,10 @@ def striphtml(data):
|
||||||
|
|
||||||
# Tests when called alone
|
# Tests when called alone
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print(parseURL("www.nemunai.re"))
|
print(parseURL("http://www.nemunai.re/"))
|
||||||
print(parseURL("www.nemunai.re/?p0m"))
|
print(parseURL("http://www.nemunai.re/?p0m"))
|
||||||
print(parseURL("http://www.nemunai.re/?p0m"))
|
print(parseURL("http://www.nemunai.re/?p0m"))
|
||||||
print(parseURL("http://www.nemunai.re:42/?p0m"))
|
print(parseURL("http://www.nemunai.re:42/?p0m"))
|
||||||
print(parseURL("www.nemunai.re:42/?p0m"))
|
print(parseURL("ftp://www.nemunai.re:42/?p0m"))
|
||||||
print(parseURL("http://www.nemunai.re/?p0m"))
|
print(parseURL("http://www.nemunai.re/?p0m"))
|
||||||
|
print(parseURL("magnet:ceciestunmagnet!"))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue