nemubot/modules/networking/page.py

132 lines
3.4 KiB
Python
Raw Normal View History

2014-12-01 17:13:58 +00:00
import http.client
import socket
import subprocess
import tempfile
import urllib
from nemubot import __version__
from nemubot.exception import IMException
from nemubot.tools import web
2014-12-01 17:13:58 +00:00
def load(CONF, add_hook):
    """Module loading entry point; currently performs no initialisation.

    Arguments:
    CONF -- unused here (presumably the module configuration; verify
            against the caller)
    add_hook -- unused here (presumably a hook registration callable;
                verify against the caller)
    """
    # TODO: check w3m exists
    return None
def headers(url):
    """Retrieve HTTP header for the given URL

    Argument:
    url -- the page URL to get header

    Return a (version, status, reason, headers) tuple, where headers is the
    list of (name, value) pairs given by the response.

    Raise IMException when the URL has no network location, or when the
    request fails (connection error, timeout, name resolution failure or
    malformed status line).
    """
    o = urllib.parse.urlparse(web.getNormalizedURL(url), "http")
    if o.netloc == "":
        raise IMException("invalid URL")

    if o.scheme == "http":
        conn = http.client.HTTPConnection(o.hostname, port=o.port, timeout=5)
    else:
        conn = http.client.HTTPSConnection(o.hostname, port=o.port, timeout=5)

    # Rebuild the full request target: urlparse() splits params and query
    # away from the path, and sending only o.path would silently request a
    # different resource.
    target = o.path or "/"
    if o.params:
        target += ";" + o.params
    if o.query:
        target += "?" + o.query

    try:
        conn.request("HEAD", target, None, {"User-agent":
                                            "Nemubot v%s" % __version__})
        res = conn.getresponse()
        return (res.version, res.status, res.reason, res.getheaders())
    except http.client.BadStatusLine:
        # Listed first: RemoteDisconnected is both a BadStatusLine and a
        # ConnectionError, and must keep its historical message.
        raise IMException("An error occurs")
    except ConnectionError as e:
        # strerror is None for errors not built from an errno.
        raise IMException(e.strerror or str(e))
    except socket.timeout:
        raise IMException("request timeout")
    except socket.gaierror:
        print ("<tools.web> Unable to receive page %s from %s on %d."
               % (o.path, o.hostname, o.port if o.port is not None else 0))
        raise IMException("an unexpected error occurs")
    finally:
        # Always release the socket, including when request() itself fails.
        conn.close()
def _onNoneDefault():
    """Default onNone handler: always raise an IMException."""
    message = "An error occurs when trying to access the page"
    raise IMException(message)
2014-12-01 17:13:58 +00:00
def fetch(url, onNone=_onNoneDefault):
    """Retrieve the content of the given URL

    Arguments:
    url -- the URL to fetch
    onNone -- called, and its result returned, when no content has been
              retrieved; if it is not callable, None is returned instead

    Raise IMException on timeout or on any other socket-level error.
    """
    try:
        req = web.getURLContent(url)
        if req is not None:
            return req
        if callable(onNone):
            return onNone()
        return None
    except socket.timeout:
        # Must precede socket.error: a timeout is also an OSError.
        raise IMException("The request timeout when trying to access the page")
    except socket.error as e:
        # socket.error is OSError, so this also covers ConnectionError.
        # strerror is None when the error carries no errno; fall back to
        # the exception text rather than raising IMException(None).
        raise IMException(e.strerror or str(e))
2014-12-01 17:13:58 +00:00
def _render(cnt):
"""Render the page contained in cnt as HTML page"""
if cnt is None:
return None
2014-12-01 17:13:58 +00:00
with tempfile.NamedTemporaryFile() as fp:
fp.write(cnt.encode())
args = ["w3m", "-T", "text/html", "-dump"]
args.append(fp.name)
with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
return proc.stdout.read().decode()
def render(url, onNone=_onNoneDefault):
    """Fetch the given URL and render its content through w3m

    Arguments:
    url -- the URL to render
    onNone -- forwarded as is to fetch()
    """
    content = fetch(url, onNone)
    return _render(content)
2014-12-01 17:13:58 +00:00
def traceURL(url, stack=None):
    """Follow redirections and return the redirections stack

    Arguments:
    url -- the URL to trace
    stack -- list of already visited URLs, extended in place; a new list
             is created when omitted

    Return the stack: every visited URL in order, possibly ended by a
    'stack overflow :(' or 'loop on <url>' marker.
    """
    if stack is None:
        stack = list()

    # 301, 302, 303, plus 307/308 which also carry a Location to follow.
    redirections = (301, 302, 303, 307, 308)

    while True:
        stack.append(url)

        if len(stack) > 15:
            stack.append('stack overflow :(')
            return stack

        _, status, _, heads = headers(url)
        if status not in redirections:
            return stack

        # Header names are case-insensitive; take the first Location.
        location = next((c for h, c in heads if h.lower() == "location"),
                        None)
        if location is None:
            return stack

        # Location may be relative: resolve it against the current URL.
        url = urllib.parse.urljoin(url, location)
        if url in stack:
            stack.append("loop on " + url)
            return stack