# coding=utf-8

"""Use the MediaWiki API to get pages"""

import re
import urllib.parse

from nemubot.exception import IRCException
from nemubot.hooks import hook
from nemubot.tools import web

nemubotversion = 3.4

from more import Response


# MEDIAWIKI REQUESTS ##################################################

def get_namespaces(site, ssl=False):
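    """Return the wiki's namespaces, indexed by their local name."""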

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&action=query&meta=siteinfo&siprop=namespaces" % (
        "s" if ssl else "", site)

    # Make the request
    data = web.getJSON(url)

    # Index namespaces by their local name
    namespaces = dict()
    for ns in data["query"]["namespaces"].values():
        namespaces[ns["*"]] = ns

    return namespaces


def get_raw_page(site, term, ssl=False):
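    """Return the raw wikitext of the latest revision of the given article."""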

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&redirects&action=query&prop=revisions&rvprop=content&titles=%s" % (
        "s" if ssl else "", site, urllib.parse.quote(term))

    # Make the request
    data = web.getJSON(url)

    for k in data["query"]["pages"]:
        try:
            return data["query"]["pages"][k]["revisions"][0]["*"]
        except (KeyError, IndexError):
            raise IRCException("article not found")


def get_unwikitextified(site, wikitext, ssl=False):
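    """Expand the templates contained in the given wikitext snippet."""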

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&action=expandtemplates&text=%s" % (
        "s" if ssl else "", site, urllib.parse.quote(wikitext))

    # Make the request
    data = web.getJSON(url)

    return data["expandtemplates"]["*"]


## Search

def opensearch(site, term, ssl=False):
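    """Yield (title, description, URL) tuples from the wiki's opensearch endpoint."""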

    # Build the URL
    url = "http%s://%s/w/api.php?format=xml&action=opensearch&search=%s" % (
        "s" if ssl else "", site, urllib.parse.quote(term))

    # Make the request
    response = web.getXML(url)

    if response is not None and response.hasNode("Section"):
        for itm in response.getNode("Section").getNodes("Item"):
            yield (itm.getNode("Text").getContent(),
                   itm.getNode("Description").getContent(),
                   itm.getNode("Url").getContent())


def search(site, term, ssl=False):
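    """Yield (title, snippet) pairs from the wiki's full-text search."""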

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&action=query&list=search&srsearch=%s&srprop=titlesnippet|snippet" % (
        "s" if ssl else "", site, urllib.parse.quote(term))

    # Make the request
    data = web.getJSON(url)

    if data is not None and "query" in data and "search" in data["query"]:
        for itm in data["query"]["search"]:
            # Turn the <span class='searchmatch'> highlighting into IRC bold marks
            yield (web.striphtml(itm["titlesnippet"].replace("<span class='searchmatch'>", "\x03\x02").replace("</span>", "\x03\x02")),
                   web.striphtml(itm["snippet"].replace("<span class='searchmatch'>", "\x03\x02").replace("</span>", "\x03\x02")))


# PARSING FUNCTIONS ###################################################

def strip_model(cnt):
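    """Strip templates, comments and references from raw wikitext."""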

    # Strip templates and links at the beginning: mostly useless
    cnt = re.sub(r"^(({{([^{]|\s|({{([^{]|\s|{{.*?}})*?}})*?)*?}}|\[\[([^[]|\s|\[\[.*?\]\])*?\]\])\s*)+", "", cnt, flags=re.DOTALL)

    # Remove newlines inside templates
    for full in re.findall(r"{{.*?}}", cnt, flags=re.DOTALL):
        cnt = cnt.replace(full, full.replace("\n", " "), 1)

    # Remove newlines after titles
    cnt, _ = re.subn(r"((?P<title>==+)\s*(.*?)\s*(?P=title))\n+", r"\1", cnt)

    # Strip HTML comments
    cnt = re.sub(r"<!--.*?-->", "", cnt, flags=re.DOTALL)

    # Strip <ref> tags
    cnt = re.sub(r"<ref.*?/ref>", "", cnt, flags=re.DOTALL)

    return cnt


def parse_wikitext(site, cnt, namespaces=dict(), ssl=False):
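    """Expand templates, resolve wiki links and strip HTML from wikitext."""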

    # Expand each template through the API
    for i, _, _, _ in re.findall(r"({{([^{]|\s|({{(.|\s|{{.*?}})*?}})*?)*?}})", cnt):
        cnt = cnt.replace(i, get_unwikitextified(site, i, ssl), 1)

    # Strip [[...]] links: keep only the displayed text
    for full, args, lnk in re.findall(r"(\[\[(.*?|)?([^|]*?)\]\])", cnt):
        ns = lnk.find(":")
        if lnk == "":
            # Empty display text ([[target|]]): fall back to the target
            cnt = cnt.replace(full, args[:-1], 1)
        elif ns > 0:
            # Namespaced target: drop category links, keep other page names
            namespace = lnk[:ns]
            if namespace in namespaces and namespaces[namespace].get("canonical") == "Category":
                cnt = cnt.replace(full, "", 1)
                continue
            cnt = cnt.replace(full, lnk, 1)
        else:
            cnt = cnt.replace(full, lnk, 1)

    # Strip HTML tags
    cnt = web.striphtml(cnt)

    return cnt


# FORMATTING FUNCTIONS ################################################

def irc_format(cnt):
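    """Translate the remaining wiki markup into IRC control codes."""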

    # Render titles in reverse video; map wiki bold and italics to IRC codes
    cnt, _ = re.subn(r"(?P<title>==+)\s*(.*?)\s*(?P=title)", "\x03\x16" + r"\2" + " :\x03\x16 ", cnt)
    return cnt.replace("'''", "\x03\x02").replace("''", "\x03\x1f")


def get_page(site, term, ssl=False, subpart=None):
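    """Fetch an article, optionally reduced to the requested section."""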

    raw = get_raw_page(site, term, ssl)

    if subpart is not None:
        # Keep only the requested section
        subpart = subpart.replace("_", " ")
        raw = re.sub(r"^.*(?P<title>==+)\s*(" + re.escape(subpart) + r")\s*(?P=title)", r"\1 \2 \1", raw, flags=re.DOTALL)

    return strip_model(raw)


# NEMUBOT #############################################################

def mediawiki_response(site, term, receivers):
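    """Build the response for an article request, with search suggestions as fallback."""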

    ns = get_namespaces(site)

    # "Article#Section" asks for a specific section
    terms = term.split("#", 1)

    try:
        # Print the article if it exists
        return Response(get_page(site, terms[0], subpart=terms[1] if len(terms) > 1 else None),
                        line_treat=lambda line: irc_format(parse_wikitext(site, line, ns)),
                        channel=receivers)
    except IRCException:
        # Try opensearch suggestions first
        suggestions = [x for x, _, _ in opensearch(site, terms[0])]

        # Fall back to full-text search
        if not suggestions:
            suggestions = [x for x, _ in search(site, terms[0]) if x]

        return Response(suggestions,
                        channel=receivers,
                        title="Article not found, did you mean")


@hook("cmd_hook", "mediawiki")
def cmd_mediawiki(msg):
    """Read an article on a MediaWiki"""

    if len(msg.args) < 2:
        raise IRCException("indicate a domain and a term to search")

    return mediawiki_response(msg.args[0],
                              " ".join(msg.args[1:]),
                              msg.receivers)


@hook("cmd_hook", "search_mediawiki")
def cmd_srchmediawiki(msg):
    """Search an article on a MediaWiki"""

    if len(msg.args) < 2:
        raise IRCException("indicate a domain and a term to search")

    res = Response(channel=msg.receivers, nomore="No more results", count=" (%d more results)")

    for r in search(msg.args[0], " ".join(msg.args[1:])):
        res.append_message("%s: %s" % r)

    return res


@hook("cmd_hook", "wikipedia")
def cmd_wikipedia(msg):
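    """Read an article on Wikipedia"""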

    if len(msg.args) < 2:
        raise IRCException("indicate a lang and a term to search")

    return mediawiki_response(msg.args[0] + ".wikipedia.org",
                              " ".join(msg.args[1:]),
                              msg.receivers)