2012-07-24 15:56:20 +00:00
|
|
|
# coding=utf-8
|
|
|
|
|
|
|
|
import re
|
|
|
|
from urllib.parse import quote
|
2012-11-07 13:39:47 +00:00
|
|
|
import urllib.request
|
2012-07-24 15:56:20 +00:00
|
|
|
|
2012-08-31 00:58:28 +00:00
|
|
|
import xmlparser
|
2012-07-24 15:56:20 +00:00
|
|
|
|
|
|
|
class Wikipedia:
    """Fetch a page from a MediaWiki site and iterate over its text lines.

    The page's wikicode is downloaded through the MediaWiki API, cleaned
    with striplink() and kept in ``self.wres`` — or ``None`` when the page
    does not exist or the API reply is malformed.
    """

    def __init__(self, terms, lang="fr", site="wikipedia.org", section=0):
        """Download the page *terms* from ``lang``.``site``.

        terms   -- title of the page to fetch
        lang    -- language subdomain of the wiki
        site    -- base domain of the MediaWiki installation
        section -- number of leading lines nextRes skips before yielding
        """
        self.terms = terms
        self.lang = lang
        # Countdown of lines still to skip before nextRes starts yielding.
        self.curRT = section

        url = ("http://" + self.lang + "." + site
               + "/w/api.php?format=xml&redirects&action=query"
                 "&prop=revisions&rvprop=content&titles=%s" % (quote(terms)))
        req = urllib.request.Request(url, headers={"User-agent": "Nemubot v3"})
        # Use the response as a context manager so the HTTP connection is
        # closed deterministically (the original leaked it).
        with urllib.request.urlopen(req) as raw:
            self.wres = xmlparser.parse_string(raw.read())

        # Walk down to the <rev> node carrying the page content; a missing
        # level at any point means the page was not found or the reply is
        # malformed.  (Also guards the final "rev" level, which the
        # original dereferenced unchecked.)
        node = self.wres
        for child in ("query", "pages", "page", "revisions", "rev"):
            if node is None or not node.hasNode(child):
                node = None
                break
            node = node.getFirstNode(child)

        if node is None:
            self.wres = None
        else:
            self.wres = striplink(node.getContent())

    @property
    def nextRes(self):
        """Generator over the non-empty lines of the fetched page.

        Runs of whitespace are collapsed to single spaces and the lines
        are stripped; the first ``section`` lines (constructor argument)
        are skipped.  Yields nothing when the page could not be fetched.
        """
        if self.wres is None:
            return
        for line in self.wres.split("\n"):
            # Honour the requested section offset.
            if self.curRT > 0:
                self.curRT -= 1
                continue
            cleaned = re.sub(r'\s+', ' ', line).strip()
            if cleaned:
                yield cleaned
|
|
|
|
|
2012-11-09 10:56:23 +00:00
|
|
|
# Removal pattern: HTML comments, <ref .../> and <ref>...</ref>, <dfn>...</dfn>
# tags, {{templates}} nested up to three levels, [[links]] containing nested
# links, multi-target [[a|b|...]] links, and the "#* ''" quotation marker plus
# its trailing newline.  Everything it matches is deleted by striplink().
RGXP_p = re.compile(r"(<!--.*-->|<ref[^>]*/>|<ref[^>]*>[^>]*</ref>|<dfn[^>]*>[^>]*</dfn>|\{\{[^{}]*\}\}|\[\[([^\[\]]*\[\[[^\]\[]*\]\])+[^\[\]]*\]\]|\{\{([^{}]*\{\{[^{}]*\}\}[^{}]*)+\}\}|\{\{([^{}]*\{\{([^{}]*\{\{[^{}]*\}\}[^{}]*)+\}\}[^{}]*)+\}\}|\[\[[^\]|]+(\|[^\]\|]+)*\]\])|#\* ''" + "\n", re.I)

# {{nobr|text}} and {{lang|xx|text}} templates: group 2 is the wrapped text,
# which striplink() keeps.
RGXP_l = re.compile(r'\{\{(nobr|lang\|[^|}]+)\|([^}]+)\}\}', re.I)

# {{pron|X|lang}} pronunciation template: group 1 is the pronunciation,
# rendered as "[X]" by striplink().
RGXP_m = re.compile(r'\{\{pron\|([^|}]+)\|[^}]+\}\}', re.I)

# Section heading ("== Title ==") followed by the first character of the next
# paragraph (group 2).  NOTE: not a raw string — the \n are literal newlines.
RGXP_t = re.compile("==+ *([^=]+) *=+=\n+([^\n])", re.I)

# Piped wikilink [[target|label]]: group 1 is the target, group 2 the label.
RGXP_q = re.compile(r'\[\[([^\[\]|]+)\|([^\]|]+)]]', re.I)

# Plain wikilink [[target]]: group 1 is the target.
RGXP_r = re.compile(r'\[\[([^\[\]|]+)\]\]', re.I)

# Any run of whitespace; collapsed to a single space when yielding lines.
RGXP_s = re.compile(r'\s+')
|
2012-10-04 11:46:11 +00:00
|
|
|
|
2012-11-07 13:39:47 +00:00
|
|
|
def striplink(s):
    """Strip MediaWiki markup from the wikicode *s*.

    Returns the cleaned text, with section titles and bold/italic markers
    converted to IRC control-code formatting.
    """
    # Bug fix: the original called str.replace() without assigning the
    # result, so these gender-template substitutions never took effect.
    s = s.replace("{{m}}", "masculin") \
         .replace("{{f}}", "feminin") \
         .replace("{{n}}", "neutre")

    # {{pron|X|lang}} -> [X]
    s = RGXP_m.sub(r"[\1]", s)
    # {{nobr|text}} / {{lang|xx|text}} -> text
    s = RGXP_l.sub(r"\2", s)

    # [[target|label]] -> target, then [[target]] -> target
    s = RGXP_q.sub(r"\1", s)
    s = RGXP_r.sub(r"\1", s)

    # Drop comments, <ref>/<dfn> tags, remaining templates and complex links.
    s = RGXP_p.sub('', s)

    if s == "":
        return s

    # "== Title ==" headings -> reverse-video "Title :" prefix (IRC codes).
    s = RGXP_t.sub("\x03\x16" + r"\1" + " :\x03\x16 " + r"\2", s)

    # Wiki bold ''' -> IRC bold, wiki italic '' -> IRC underline.
    return s.replace("'''", "\x03\x02").replace("''", "\x03\x1f")
|