nemubot/modules/networking/page.py

import http.client
import socket
import subprocess
import tempfile
import urllib

from nemubot import __version__
from nemubot.exception import IMException
from nemubot.tools import web


def load(CONF, add_hook):
    """Loading entry point of the module; currently a no-op.

    Arguments:
    CONF -- unused for now
    add_hook -- unused for now
    """
    # TODO: check w3m exists
    return None


def headers(url):
    """Retrieve HTTP header for the given URL

    Issues a HEAD request (5 seconds timeout) and returns what the server
    answered, without following redirections.

    Argument:
    url -- the page URL to get header

    Returns a tuple (HTTP version, status code, reason phrase, list of
    (header name, header value) pairs).

    Raises IMException if the URL has no network location or if the request
    fails (connection error, timeout, name resolution failure, bad status
    line).
    """

    o = urllib.parse.urlparse(web.getNormalizedURL(url), "http")
    if o.netloc == "":
        raise IMException("invalid URL")

    if o.scheme == "http":
        conn = http.client.HTTPConnection(o.hostname, port=o.port, timeout=5)
    else:
        conn = http.client.HTTPSConnection(o.hostname, port=o.port, timeout=5)

    # Build the full request target: urlparse() splits parameters and query
    # string away from the path, so they have to be put back or the server
    # is asked for a different resource.
    target = o.path or "/"
    if o.params:
        target += ";" + o.params
    if o.query:
        target += "?" + o.query

    # The outer try/finally guarantees the socket is released whatever
    # happens, including when the request itself fails.
    try:
        try:
            conn.request("HEAD", target, None, {"User-agent":
                                                "Nemubot v%s" % __version__})
        except ConnectionError as e:
            # strerror is None when the exception carries no errno
            raise IMException(e.strerror or str(e))
        except socket.timeout:
            raise IMException("request timeout")
        except socket.gaierror:
            print ("<tools.web> Unable to receive page %s from %s on %d."
                   % (o.path, o.hostname, o.port if o.port is not None else 0))
            raise IMException("an unexpected error occurs")

        try:
            res = conn.getresponse()
        except http.client.BadStatusLine:
            raise IMException("An error occurs")
        except socket.timeout:
            # The server accepted the connection but never answered
            raise IMException("request timeout")

        return (res.version, res.status, res.reason, res.getheaders())
    finally:
        conn.close()


def _onNoneDefault():
    """Default onNone callback: always raise an IMException."""
    message = "An error occurs when trying to access the page"
    raise IMException(message)


def fetch(url, onNone=_onNoneDefault):
    """Retrieve the content of the given URL

    Arguments:
    url -- the URL to fetch
    onNone -- callable invoked (and whose result is returned) when no content
              is retrieved; if it is not callable, None is returned instead

    Raises IMException on timeout or on any other socket/connection error.
    """

    try:
        req = web.getURLContent(url)
        if req is not None:
            return req
        if callable(onNone):
            return onNone()
        return None
    except socket.timeout:
        # Must be tested before the generic handler: socket.timeout is also
        # an OSError.
        raise IMException("The request timeout when trying to access the page")
    except (ConnectionError, socket.error) as e:
        # strerror is None when the exception was built without an errno;
        # fall back on its string form so the message is never empty.
        raise IMException(e.strerror or str(e))


def _render(cnt):
    """Render the page contained in cnt as HTML page"""
    if cnt is None:
        return None

    with tempfile.NamedTemporaryFile() as fp:
        fp.write(cnt.encode())

        args = ["w3m", "-T", "text/html", "-dump"]
        args.append(fp.name)
        with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
            return proc.stdout.read().decode()


def render(url, onNone=_onNoneDefault):
    """Fetch the given URL and render its content through w3m

    Arguments:
    url -- the URL to render
    onNone -- passed as is to fetch()
    """

    content = fetch(url, onNone)
    return _render(content)


def traceURL(url, stack=None):
    """Follow redirections and return the redirections stack

    Arguments:
    url -- the URL to trace
    stack -- list of already visited URLs, extended in place (a new list is
             created when omitted)

    Returns the stack: each visited URL in order, possibly ended by a
    'loop on <url>' or 'stack overflow :(' marker when the trace is aborted.
    """

    if stack is None:
        stack = list()

    # 301, 302, 303, 307 and 308 all carry the new address in Location
    redirections = (301, 302, 303, 307, 308)

    while True:
        stack.append(url)

        if len(stack) > 15:
            stack.append('stack overflow :(')
            return stack

        _, status, _, heads = headers(url)
        if status not in redirections:
            return stack

        # Header names are case-insensitive; take the first Location found
        location = next((c for h, c in heads if h.lower() == "location"),
                        None)
        if location is None:
            return stack

        # Location may be relative to the URL that has just been requested
        target = urllib.parse.urljoin(url, location)
        if target in stack:
            stack.append("loop on " + target)
            return stack

        url = target
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`import http.client`
			`import socket`
			`import subprocess`
			`import tempfile`
			`import urllib`

[wip] in modules, changes import to reflect new directory structure 2015-01-03 19:34:44 +00:00			`from nemubot import __version__`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`from nemubot.exception import IMException`
[wip] in modules, changes import to reflect new directory structure 2015-01-03 19:34:44 +00:00			`from nemubot.tools import web`
[networking] Refactor module 2014-12-01 17:13:58 +00:00

			`def load(CONF, add_hook):`
Update TODO item 2015-04-22 13:59:22 +00:00			`# TODO: check w3m exists`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`pass`


			`def headers(url):`
			`"""Retrieve HTTP header for the given URL`

			`Argument:`
			`url -- the page URL to get header`
			`"""`

tools/web: factorize getNormalizedURL 2015-10-13 22:17:02 +00:00			`o = urllib.parse.urlparse(web.getNormalizedURL(url), "http")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`if o.netloc == "":`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException("invalid URL")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`if o.scheme == "http":`
fix netloc != hostname 2014-12-15 23:45:01 +00:00			`conn = http.client.HTTPConnection(o.hostname, port=o.port, timeout=5)`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`else:`
fix netloc != hostname 2014-12-15 23:45:01 +00:00			`conn = http.client.HTTPSConnection(o.hostname, port=o.port, timeout=5)`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`try:`
Indicate full version in UserAgent HTTP header 2015-01-01 20:46:57 +00:00			`conn.request("HEAD", o.path, None, {"User-agent":`
			`"Nemubot v%s" % __version__})`
[networking] Handle ConnectionError exceptions 2015-07-03 00:31:27 +00:00			`except ConnectionError as e:`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException(e.strerror)`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`except socket.timeout:`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException("request timeout")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`except socket.gaierror:`
			`print ("<tools.web> Unable to receive page %s from %s on %d."`
[networking] Avoid exception when port is not defined on socket error 2015-09-01 18:29:11 +00:00			`% (o.path, o.hostname, o.port if o.port is not None else 0))`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException("an unexpected error occurs")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00
			`try:`
			`res = conn.getresponse()`
			`except http.client.BadStatusLine:`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException("An error occurs")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`finally:`
			`conn.close()`

			`return (res.version, res.status, res.reason, res.getheaders())`


			`def _onNoneDefault():`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException("An error occurs when trying to access the page")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00

			`def fetch(url, onNone=_onNoneDefault):`
			`"""Retrieve the content of the given URL`

			`Argument:`
			`url -- the URL to fetch`
			`"""`

			`try:`
			`req = web.getURLContent(url)`
			`if req is not None:`
			`return req`
			`else:`
[networking] Dusting module 2015-10-12 04:28:52 +00:00			`if callable(onNone):`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`return onNone()`
			`else:`
			`return None`
[networking] Handle ConnectionError exceptions 2015-07-03 00:31:27 +00:00			`except ConnectionError as e:`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException(e.strerror)`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`except socket.timeout:`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException("The request timeout when trying to access the page")`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`except socket.error as e:`
Replace IRCException by IMException, as nemubot is not only built for IRC 2015-10-30 20:57:45 +00:00			`raise IMException(e.strerror)`
[networking] Refactor module 2014-12-01 17:13:58 +00:00

[networking] fix watch pages that aren't text/html 2015-07-01 16:15:35 +00:00			`def _render(cnt):`
			`"""Render the page contained in cnt as HTML page"""`
			`if cnt is None:`
			`return None`
[networking] Refactor module 2014-12-01 17:13:58 +00:00
			`with tempfile.NamedTemporaryFile() as fp:`
			`fp.write(cnt.encode())`

			`args = ["w3m", "-T", "text/html", "-dump"]`
			`args.append(fp.name)`
			`with subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:`
			`return proc.stdout.read().decode()`


[networking] fix watch pages that aren't text/html 2015-07-01 16:15:35 +00:00			`def render(url, onNone=_onNoneDefault):`
			`"""Use w3m to render the given url`

			`Argument:`
			`url -- the URL to render`
			`"""`

			`return _render(fetch(url, onNone))`


[networking] Refactor module 2014-12-01 17:13:58 +00:00			`def traceURL(url, stack=None):`
			`"""Follow redirections and return the redirections stack`

			`Argument:`
			`url -- the URL to trace`
			`"""`

			`if stack is None:`
			`stack = list()`
			`stack.append(url)`

			`if len(stack) > 15:`
			`stack.append('stack overflow :(')`
			`return stack`

[networking] Fix variable name conflict 2015-02-12 22:55:47 +00:00			`_, status, _, heads = headers(url)`
[networking] Refactor module 2014-12-01 17:13:58 +00:00
			`if status == http.client.FOUND or status == http.client.MOVED_PERMANENTLY or status == http.client.SEE_OTHER:`
[networking] Fix variable name conflict 2015-02-12 22:55:47 +00:00			`for h, c in heads:`
[networking] Refactor module 2014-12-01 17:13:58 +00:00			`if h == "Location":`
			`url = c`
			`if url in stack:`
			`stack.append("loop on " + url)`
			`return stack`
			`else:`
			`return traceURL(url, stack)`
			`return stack`