Mediawiki module: fetch namespaces list to hide categories

nemunaire 2014-10-22 07:38:53 +02:00
parent 67f6d49fb8
commit 5e87843dda


@@ -14,6 +14,21 @@ nemubotversion = 3.4
 from more import Response

+
+def get_namespaces(site, ssl=False):
+    # Built URL
+    url = "http%s://%s/w/api.php?format=json&action=query&meta=siteinfo&siprop=namespaces" % (
+        "s" if ssl else "", site)
+    print_debug(url)
+
+    # Make the request
+    raw = urllib.request.urlopen(url)
+    data = json.loads(raw.read().decode())
+
+    namespaces = dict()
+    for ns in data["query"]["namespaces"]:
+        namespaces[data["query"]["namespaces"][ns]["*"]] = data["query"]["namespaces"][ns]
+
+    return namespaces
+
 def get_raw_page(site, term, ssl=False):
     # Built URL
     url = "http%s://%s/w/api.php?format=json&redirects&action=query&prop=revisions&rvprop=content&titles=%s" % (
@@ -55,12 +70,23 @@ def strip_model(cnt):
     cnt = re.sub(r"<ref.*?/ref>", "", cnt)
     return cnt

-def parse_wikitext(site, cnt, ssl=False):
+def parse_wikitext(site, cnt, namespaces=dict(), ssl=False):
     for i,_,_,_ in re.findall(r"({{([^{]|\s|({{(.|\s|{{.*?}})*?}})*?)*?}})", cnt):
         cnt = cnt.replace(i, get_unwikitextified(site, i, ssl), 1)

     # Strip [[...]]
-    cnt, _ = re.subn(r"\[\[:?([^]]*\|)?([^]]+?)\]\]", r"\2", cnt)
+    for full,args,lnk in re.findall(r"(\[\[(.*?|)?([^|]*?)\]\])", cnt):
+        ns = lnk.find(":")
+        if lnk == "":
+            cnt = cnt.replace(full, args[:-1], 1)
+        elif ns > 0:
+            namespace = lnk[:ns]
+            if namespace in namespaces and namespaces[namespace]["canonical"] == "Category":
+                cnt = cnt.replace(full, "", 1)
+                continue
+            cnt = cnt.replace(full, lnk, 1)
+        else:
+            cnt = cnt.replace(full, lnk, 1)

     # Strip HTML tags
     cnt = web.striphtml(cnt)
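The old re.subn() call stripped every [[...]] link down to its label; the loop above instead keeps the visible label of ordinary links, resolves the pipe trick ([[Page|]]) to the page name, and drops links whose namespace is canonically "Category". Below is a condensed, self-contained sketch of that behaviour; the sample text and the one-entry namespace table are illustrative, while the real module obtains the table from get_namespaces().

import re

# Namespace table as get_namespaces() would return it (single entry, assumed).
namespaces = {"Catégorie": {"id": 14, "*": "Catégorie", "canonical": "Category"}}

cnt = "Le [[chat|Chat]] est un [[mammifère]]. [[Catégorie:Félin]]"

for full, args, lnk in re.findall(r"(\[\[(.*?|)?([^|]*?)\]\])", cnt):
    ns = lnk.find(":")
    if lnk == "":
        # Pipe trick, e.g. [[Page|]]: keep the page name without the pipe.
        cnt = cnt.replace(full, args[:-1], 1)
    elif ns > 0 and namespaces.get(lnk[:ns], {}).get("canonical") == "Category":
        # Category link: hide it entirely.
        cnt = cnt.replace(full, "", 1)
    else:
        # Ordinary link: keep only the visible label.
        cnt = cnt.replace(full, lnk, 1)

print(cnt)  # -> "Le Chat est un mammifère. "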
@@ -113,8 +139,12 @@ def cmd_mediawiki(msg):
     if len(msg.cmds) < 3:
         raise IRCException("indicate a domain and a term to search")

-    return Response(get_page(msg.cmds[1], " ".join(msg.cmds[2:])),
-                    line_treat=lambda line: irc_format(parse_wikitext(msg.cmds[1], line)),
+    site = msg.cmds[1]
+
+    ns = get_namespaces(site)
+
+    return Response(get_page(site, " ".join(msg.cmds[2:])),
+                    line_treat=lambda line: irc_format(parse_wikitext(msg.cmds[1], line, ns)),
                     channel=msg.receivers)
@@ -139,6 +169,8 @@ def cmd_wikipedia(msg):
     site = msg.cmds[1] + ".wikipedia.org"

+    ns = get_namespaces(site)
+
     return Response(get_page(site, " ".join(msg.cmds[2:])),
-                    line_treat=lambda line: irc_format(parse_wikitext(site, line)),
+                    line_treat=lambda line: irc_format(parse_wikitext(site, line, ns)),
                     channel=msg.receivers)
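Both commands now fetch the namespace table once per request and pass it to parse_wikitext() through line_treat. A quick standalone check of the siteinfo query this relies on, assuming network access and that the wiki accepts urllib's default User-Agent (fr.wikipedia.org is only an example):

import json
import urllib.request

url = ("https://fr.wikipedia.org/w/api.php"
       "?format=json&action=query&meta=siteinfo&siprop=namespaces")
with urllib.request.urlopen(url) as raw:
    data = json.loads(raw.read().decode())

# Print the localized name of every namespace whose canonical name is "Category".
for ns in data["query"]["namespaces"].values():
    if ns.get("canonical") == "Category":
        print(ns["*"])  # expected: "Catégorie" on fr.wikipedia.org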