New module mediawiki
parent 3bc53bb4ef
commit 7387fabee1
3 changed files with 92 additions and 107 deletions
@@ -1,56 +0,0 @@
-# coding=utf-8
-
-import re
-from urllib.parse import quote
-import urllib.request
-
-import xmlparser
-
-class Wikipedia:
-    def __init__(self, terms, lang="fr", site="wikipedia.org", section=0):
-        self.terms = terms
-        self.lang = lang
-        self.curRT = section
-
-        raw = urllib.request.urlopen(urllib.request.Request("http://" + self.lang + "." + site + "/w/api.php?format=xml&redirects&action=query&prop=revisions&rvprop=content&titles=%s" % (quote(terms)), headers={"User-agent": "Nemubot v3"}))
-        self.wres = xmlparser.parse_string(raw.read())
-        if self.wres is None or not (self.wres.hasNode("query") and self.wres.getFirstNode("query").hasNode("pages") and self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page") and self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions")):
-            self.wres = None
-        else:
-            self.wres = self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent()
-            self.wres = striplink(self.wres)
-
-    @property
-    def nextRes(self):
-        if self.wres is not None:
-            for cnt in self.wres.split("\n"):
-                if self.curRT > 0:
-                    self.curRT -= 1
-                    continue
-
-                (c, u) = RGXP_s.subn(' ', cnt)
-                c = c.strip()
-                if c != "":
-                    yield c
-
-RGXP_p = re.compile(r"(<!--.*-->|<ref[^>]*/>|<ref[^>]*>[^>]*</ref>|<dfn[^>]*>[^>]*</dfn>|\{\{[^{}]*\}\}|\[\[([^\[\]]*\[\[[^\]\[]*\]\])+[^\[\]]*\]\]|\{\{([^{}]*\{\{[^{}]*\}\}[^{}]*)+\}\}|\{\{([^{}]*\{\{([^{}]*\{\{[^{}]*\}\}[^{}]*)+\}\}[^{}]*)+\}\}|\[\[[^\]|]+(\|[^\]\|]+)*\]\])|#\* ''" + "\n", re.I)
-RGXP_l = re.compile(r'\{\{(nobr|lang\|[^|}]+)\|([^}]+)\}\}', re.I)
-RGXP_m = re.compile(r'\{\{pron\|([^|}]+)\|[^}]+\}\}', re.I)
-RGXP_t = re.compile("==+ *([^=]+) *=+=\n+([^\n])", re.I)
-RGXP_q = re.compile(r'\[\[([^\[\]|]+)\|([^\]|]+)]]', re.I)
-RGXP_r = re.compile(r'\[\[([^\[\]|]+)\]\]', re.I)
-RGXP_s = re.compile(r'\s+')
-
-def striplink(s):
-    s.replace("{{m}}", "masculin").replace("{{f}}", "feminin").replace("{{n}}", "neutre")
-    (s, n) = RGXP_m.subn(r"[\1]", s)
-    (s, n) = RGXP_l.subn(r"\2", s)
-
-    (s, n) = RGXP_q.subn(r"\1", s)
-    (s, n) = RGXP_r.subn(r"\1", s)
-
-    (s, n) = RGXP_p.subn('', s)
-    if s == "": return s
-
-    (s, n) = RGXP_t.subn("\x03\x16" + r"\1" + " :\x03\x16 " + r"\2", s)
-    return s.replace("'''", "\x03\x02").replace("''", "\x03\x1f")
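For reference, the removed module drove the XML flavour of the same MediaWiki API. A minimal standalone sketch of the request it issued, using only the standard library and a hypothetical search term:

    import urllib.request
    from urllib.parse import quote

    term = "Python"  # hypothetical term
    url = ("http://fr.wikipedia.org/w/api.php?format=xml&redirects"
           "&action=query&prop=revisions&rvprop=content&titles=%s" % quote(term))
    req = urllib.request.Request(url, headers={"User-agent": "Nemubot v3"})
    with urllib.request.urlopen(req) as raw:
        payload = raw.read()  # wikitext sits under query/pages/page/revisions/rev

The class then walked that XML with xmlparser and passed the wikitext through striplink(), whose RGXP_* expressions strip templates, links, refs and comments before the text is cut into lines for IRC.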
@@ -11,7 +11,6 @@ nemubotversion = 3.4
 from . import DDGSearch
 from . import UrbanDictionnary
 from . import WFASearch
-from . import Wikipedia
 
 def load(context):
     global CONF
@@ -98,53 +97,3 @@ def calculate(msg):
         return res
     else:
         return Response(msg.sender, s.error, msg.channel)
-
-
-@hook("cmd_hook", "wikipedia")
-def wikipedia(msg):
-    return wiki("wikipedia.org", 0, msg)
-
-@hook("cmd_hook", "wiktionary")
-def wiktionary(msg):
-    return wiki("wiktionary.org", 1, msg)
-
-@hook("cmd_hook", "etymology")
-def wiktionary(msg):
-    return wiki("wiktionary.org", 0, msg)
-
-def wiki(site, section, msg):
-    if len(msg.cmds) <= 1:
-        return Response(msg.sender,
-                        "Indicate a term to search",
-                        msg.channel, nick=msg.nick)
-    if len(msg.cmds) > 2 and len(msg.cmds[1]) < 4:
-        lang = msg.cmds[1]
-        extract = 2
-    else:
-        lang = "fr"
-        extract = 1
-
-    s = Wikipedia.Wikipedia(' '.join(msg.cmds[extract:]), lang, site, section)
-
-    res = Response(msg.sender, channel=msg.channel, nomore="No more results")
-    if site == "wiktionary.org":
-        tout = [result for result in s.nextRes if result.find("\x03\x16 :\x03\x16 ") != 0]
-        if len(tout) > 0:
-            defI = 1
-            for t in tout:
-                if t.find("# ") == 0:
-                    t = t.replace("# ", "%d. " % defI)
-                    defI += 1
-                elif t.find("#* ") == 0:
-                    t = t.replace("#* ", " * ")
-                res.append_message(t)
-    else:
-        for result in s.nextRes:
-            res.append_message(result)
-
-    if len(res.messages) > 0:
-        return res
-    else:
-        return Response(msg.sender,
-                        "No information about " + " ".join(msg.cmds[extract:]),
-                        msg.channel)
-
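The removed wiki() helper guessed the target language from the command line: a second token shorter than four characters was taken as a language code, everything else fell back to "fr". A standalone sketch of that heuristic (split_lang is a hypothetical name, not part of the module):

    def split_lang(cmds, default="fr"):
        # A short token such as "en" or "de" before the terms selects the wiki language.
        if len(cmds) > 2 and len(cmds[1]) < 4:
            return cmds[1], " ".join(cmds[2:])
        return default, " ".join(cmds[1:])

    assert split_lang(["wikipedia", "en", "Python"]) == ("en", "Python")
    assert split_lang(["wikipedia", "Python"]) == ("fr", "Python")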
modules/mediawiki.py (new file, 92 lines)
@@ -0,0 +1,92 @@
+# coding=utf-8
+
+"""Use MediaWiki API to get pages"""
+
+import json
+import re
+import urllib.parse
+import urllib.request
+
+from hooks import hook
+from tools.web import striphtml
+
+nemubotversion = 3.4
+
+def get_raw_page(site, term, ssl=False):
+    # Build the URL
+    url = "http%s://%s/w/api.php?format=json&redirects&action=query&prop=revisions&rvprop=content&titles=%s" % (
+        "s" if ssl else "", site, urllib.parse.quote(term))
+    print_debug(url)
+
+    # Make the request
+    raw = urllib.request.urlopen(url)
+    data = json.loads(raw.read().decode())
+
+    for k in data["query"]["pages"]:
+        return data["query"]["pages"][k]["revisions"][0]["*"]
+
+def get_unwikitextified(site, wikitext, ssl=False):
+    # Build the URL
+    url = "http%s://%s/w/api.php?format=json&action=expandtemplates&text=%s" % (
+        "s" if ssl else "", site, urllib.parse.quote(wikitext))
+    print_debug(url)
+
+    # Make the request
+    raw = urllib.request.urlopen(url)
+    data = json.loads(raw.read().decode())
+
+    return data["expandtemplates"]["*"]
+
+
+def strip_model(cnt):
+    # Strip models at begin and end: mostly useless
+    cnt = re.sub(r"^(({{([^{]|\s|({{(.|\s|{{.*?}})*?}})*?)*?}}|\[\[(.|\s|\[\[.*?\]\])*?\]\])\s*)+", "", cnt)
+    #cnt = re.sub(r"({{([^{]|\s|({{(.|\s|{{.*?}})*?}})*?)*?}}\s?)+$", "", cnt)
+
+    # Strip HTML comments
+    cnt = re.sub(r"<!--.*?-->", "", cnt)
+
+    # Strip ref
+    cnt = re.sub(r"<ref.*?/ref>", "", cnt)
+    return cnt
+
+def parse_wikitext(site, cnt, ssl=False):
+    for i,_,_,_ in re.findall(r"({{([^{]|\s|({{(.|\s|{{.*?}})*?}})*?)*?}})", cnt):
+        cnt = cnt.replace(i, get_unwikitextified(site, i, ssl), 1)
+
+    # Strip [[...]]
+    cnt, _ = re.subn(r"\[\[([^]]*\|)?([^]]*?)\]\]", r"\2", cnt)
+
+    # Strip HTML tags
+    cnt = striphtml(cnt)
+
+    return cnt
+
+def irc_format(cnt):
+    cnt, _ = re.subn(r"(?P<title>==+)\s*(.*?)\s*(?P=title)\n+", "\x03\x16" + r"\2" + " :\x03\x16 ", cnt)
+    return cnt.replace("'''", "\x03\x02").replace("''", "\x03\x1f")
+
+def get_page(site, term, ssl=False):
+    return strip_model(get_raw_page(site, term, ssl))
+
+
+@hook("in_PRIVMSG_cmd", "mediawiki")
+def cmd_mediawiki(msg):
+    """Read an article on a MediaWiki"""
+    if len(msg.cmds) < 3:
+        raise IRCException("indicate a domain and a term to search")
+
+    return Response(msg.sender,
+                    get_page(msg.cmds[1], " ".join(msg.cmds[2:])),
+                    channel=msg.receivers)
+
+
+@hook("in_PRIVMSG_cmd", "wikipedia")
+def cmd_wikipedia(msg):
+    if len(msg.cmds) < 3:
+        raise IRCException("indicate a lang and a term to search")
+
+    return Response(msg.sender,
+                    get_page(msg.cmds[1] + ".wikipedia.org", " ".join(msg.cmds[2:])),
+                    channel=msg.receivers,
+                    line_treat=lambda line: irc_format(parse_wikitext(site, line, ssl)))
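The new module splits the old pipeline into composable steps: get_raw_page() fetches wikitext over the JSON API, strip_model() drops leading templates, HTML comments and <ref> blocks, parse_wikitext() expands the remaining {{...}} templates server-side through action=expandtemplates, and irc_format() maps wiki emphasis and section titles to IRC control codes. A hypothetical interactive session (network access assumed; note that cmd_wikipedia's line_treat lambda refers to site and ssl, which are not defined in that scope, so this sketch passes the site explicitly):

    site = "en.wikipedia.org"  # hypothetical wiki
    wikitext = get_page(site, "IRC", ssl=True)    # raw wikitext, leading models stripped
    first = next(l for l in wikitext.split("\n") if l.strip())
    line = parse_wikitext(site, first, ssl=True)  # one expandtemplates call per {{...}}
    print(irc_format(line))                       # '''bold''' and ''italic'' become control codes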