# coding=utf-8

"""Use the MediaWiki API to get pages"""

import re
import urllib.parse

from nemubot.exception import IRCException
from nemubot.hooks import hook
from nemubot.tools import web

nemubotversion = 3.4

from more import Response


# MEDIAWIKI REQUESTS ##################################################

def get_namespaces(site, ssl=False):
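    """Return the wiki's namespaces, indexed by their local name."""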

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&action=query&meta=siteinfo&siprop=namespaces" % (
        "s" if ssl else "", site)

    # Make the request
    data = web.getJSON(url)

    # Index namespaces by their local name
    namespaces = dict()
    for ns in data["query"]["namespaces"].values():
        namespaces[ns["*"]] = ns

    return namespaces


def get_raw_page(site, term, ssl=False):
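    """Return the raw wikitext of the latest revision of the given article."""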

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&redirects&action=query&prop=revisions&rvprop=content&titles=%s" % (
        "s" if ssl else "", site, urllib.parse.quote(term))

    # Make the request
    data = web.getJSON(url)

    for k in data["query"]["pages"]:
        try:
            return data["query"]["pages"][k]["revisions"][0]["*"]
        except (KeyError, IndexError):
            raise IRCException("article not found")


def get_unwikitextified(site, wikitext, ssl=False):
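    """Expand the templates contained in the given wikitext snippet."""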

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&action=expandtemplates&text=%s" % (
        "s" if ssl else "", site, urllib.parse.quote(wikitext))

    # Make the request
    data = web.getJSON(url)

    return data["expandtemplates"]["*"]


## Search

def opensearch(site, term, ssl=False):
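    """Yield (title, description, URL) tuples from the wiki's opensearch endpoint."""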

    # Build the URL
    url = "http%s://%s/w/api.php?format=xml&action=opensearch&search=%s" % (
        "s" if ssl else "", site, urllib.parse.quote(term))

    # Make the request
    response = web.getXML(url)

    if response is not None and response.hasNode("Section"):
        for itm in response.getNode("Section").getNodes("Item"):
            yield (itm.getNode("Text").getContent(),
                   itm.getNode("Description").getContent(),
                   itm.getNode("Url").getContent())


def search(site, term, ssl=False):
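    """Yield (title, snippet) pairs from the wiki's full-text search."""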

    # Build the URL
    url = "http%s://%s/w/api.php?format=json&action=query&list=search&srsearch=%s&srprop=titlesnippet|snippet" % (
        "s" if ssl else "", site, urllib.parse.quote(term))

    # Make the request
    data = web.getJSON(url)

    if data is not None and "query" in data and "search" in data["query"]:
        for itm in data["query"]["search"]:
            # Turn the <span class='searchmatch'> highlighting into IRC bold marks
            yield (web.striphtml(itm["titlesnippet"].replace("<span class='searchmatch'>", "\x03\x02").replace("</span>", "\x03\x02")),
                   web.striphtml(itm["snippet"].replace("<span class='searchmatch'>", "\x03\x02").replace("</span>", "\x03\x02")))


# PARSING FUNCTIONS ###################################################

def strip_model(cnt):
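    """Strip templates, comments and references from raw wikitext."""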

    # Strip templates and links at the beginning: mostly useless
    cnt = re.sub(r"^(({{([^{]|\s|({{([^{]|\s|{{.*?}})*?}})*?)*?}}|\[\[([^[]|\s|\[\[.*?\]\])*?\]\])\s*)+", "", cnt, flags=re.DOTALL)

    # Remove newlines inside templates
    for full in re.findall(r"{{.*?}}", cnt, flags=re.DOTALL):
        cnt = cnt.replace(full, full.replace("\n", " "), 1)

    # Remove newlines after titles
    cnt, _ = re.subn(r"((?P<title>==+)\s*(.*?)\s*(?P=title))\n+", r"\1", cnt)

    # Strip HTML comments
    cnt = re.sub(r"<!--.*?-->", "", cnt, flags=re.DOTALL)

    # Strip <ref> tags
    cnt = re.sub(r"<ref.*?/ref>", "", cnt, flags=re.DOTALL)

    return cnt


def parse_wikitext(site, cnt, namespaces=dict(), ssl=False):
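    """Expand templates, resolve wiki links and strip HTML from wikitext."""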

    # Expand each template through the API
    for i, _, _, _ in re.findall(r"({{([^{]|\s|({{(.|\s|{{.*?}})*?}})*?)*?}})", cnt):
        cnt = cnt.replace(i, get_unwikitextified(site, i, ssl), 1)

    # Strip [[...]] links: keep only the displayed text
    for full, args, lnk in re.findall(r"(\[\[(.*?|)?([^|]*?)\]\])", cnt):
        ns = lnk.find(":")
        if lnk == "":
            # Empty display text ([[target|]]): fall back to the target
            cnt = cnt.replace(full, args[:-1], 1)
        elif ns > 0:
            # Namespaced target: drop category links, keep other page names
            namespace = lnk[:ns]
            if namespace in namespaces and namespaces[namespace].get("canonical") == "Category":
                cnt = cnt.replace(full, "", 1)
                continue
            cnt = cnt.replace(full, lnk, 1)
        else:
            cnt = cnt.replace(full, lnk, 1)

    # Strip HTML tags
    cnt = web.striphtml(cnt)

    return cnt


# FORMATTING FUNCTIONS ################################################

def irc_format(cnt):
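    """Translate the remaining wiki markup into IRC control codes."""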

    # Render titles in reverse video; map wiki bold and italics to IRC codes
    cnt, _ = re.subn(r"(?P<title>==+)\s*(.*?)\s*(?P=title)", "\x03\x16" + r"\2" + " :\x03\x16 ", cnt)
    return cnt.replace("'''", "\x03\x02").replace("''", "\x03\x1f")


def get_page(site, term, ssl=False, subpart=None):
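    """Fetch an article, optionally reduced to the requested section."""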

    raw = get_raw_page(site, term, ssl)

    if subpart is not None:
        # Keep only the requested section
        subpart = subpart.replace("_", " ")
        raw = re.sub(r"^.*(?P<title>==+)\s*(" + re.escape(subpart) + r")\s*(?P=title)", r"\1 \2 \1", raw, flags=re.DOTALL)

    return strip_model(raw)


# NEMUBOT #############################################################

def mediawiki_response(site, term, receivers):
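    """Build the response for an article request, with search suggestions as fallback."""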

    ns = get_namespaces(site)

    # "Article#Section" asks for a specific section
    terms = term.split("#", 1)

    try:
        # Print the article if it exists
        return Response(get_page(site, terms[0], subpart=terms[1] if len(terms) > 1 else None),
                        line_treat=lambda line: irc_format(parse_wikitext(site, line, ns)),
                        channel=receivers)
    except IRCException:
        # Try opensearch suggestions first
        suggestions = [x for x, _, _ in opensearch(site, terms[0])]

        # Fall back to full-text search
        if not suggestions:
            suggestions = [x for x, _ in search(site, terms[0]) if x]

        return Response(suggestions,
                        channel=receivers,
                        title="Article not found, did you mean")


@hook("cmd_hook", "mediawiki")
def cmd_mediawiki(msg):
    """Read an article on a MediaWiki"""

    if len(msg.args) < 2:
        raise IRCException("indicate a domain and a term to search")

    return mediawiki_response(msg.args[0],
                              " ".join(msg.args[1:]),
                              msg.receivers)


@hook("cmd_hook", "search_mediawiki")
def cmd_srchmediawiki(msg):
    """Search an article on a MediaWiki"""

    if len(msg.args) < 2:
        raise IRCException("indicate a domain and a term to search")

    res = Response(channel=msg.receivers, nomore="No more results", count=" (%d more results)")

    for r in search(msg.args[0], " ".join(msg.args[1:])):
        res.append_message("%s: %s" % r)

    return res


@hook("cmd_hook", "wikipedia")
def cmd_wikipedia(msg):
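    """Read an article on Wikipedia"""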

    if len(msg.args) < 2:
        raise IRCException("indicate a lang and a term to search")

    return mediawiki_response(msg.args[0] + ".wikipedia.org",
                              " ".join(msg.args[1:]),
                              msg.receivers)