Wiki: better escape of infobox
This commit is contained in:
parent
c4e2c0b66c
commit
2fd395f4bc
@ -10,25 +10,44 @@ class Wikipedia:
|
|||||||
def __init__(self, terms, lang="fr"):
|
def __init__(self, terms, lang="fr"):
|
||||||
self.terms = terms
|
self.terms = terms
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
self.curRT = -1
|
self.curRT = 0
|
||||||
(res, page) = getPage(terms, self.lang)
|
(res, page) = getPage(terms, self.lang)
|
||||||
if res == http.client.OK or res == http.client.SEE_OTHER:
|
if res == http.client.OK or res == http.client.SEE_OTHER:
|
||||||
self.wres = xmlparser.parse_string(page)
|
self.wres = xmlparser.parse_string(page)
|
||||||
|
if self.wres is None or not (self.wres.hasNode("query") and self.wres.getFirstNode("query").hasNode("pages") and self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page") and self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions")):
|
||||||
|
self.wres = None
|
||||||
|
else:
|
||||||
|
self.infobox = parseInfobox(self)
|
||||||
else:
|
else:
|
||||||
self.wres = None
|
self.wres = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def nextRes(self):
|
def nextRes(self):
|
||||||
if self.wres is not None and self.wres.hasNode("query"):
|
if self.wres is not None:
|
||||||
if self.wres.getFirstNode("query").hasNode("pages"):
|
|
||||||
if self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page"):
|
|
||||||
if self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions"):
|
|
||||||
for cnt in self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
|
for cnt in self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
|
||||||
|
if self.curRT > 0:
|
||||||
|
self.curRT -= 1
|
||||||
|
continue
|
||||||
|
|
||||||
c = striplink(cnt).strip()
|
c = striplink(cnt).strip()
|
||||||
if c != "":
|
if c != "":
|
||||||
yield c
|
yield c
|
||||||
|
|
||||||
|
|
||||||
|
def parseInfobox(w):
|
||||||
|
inInfobox = False
|
||||||
|
view=-1
|
||||||
|
for cnt in w.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
|
||||||
|
view += 1
|
||||||
|
if inInfobox:
|
||||||
|
if cnt.find("}}") == 0:
|
||||||
|
inInfobox=False
|
||||||
|
elif cnt.find("{{") == 0:
|
||||||
|
inInfobox=True
|
||||||
|
else:
|
||||||
|
w.curRT += view
|
||||||
|
break
|
||||||
|
|
||||||
def striplink(data):
|
def striplink(data):
|
||||||
p = re.compile(r'(<!--.*-->|\{\{.*\}\}|\[\[[^\]]+\|[^\]]+\|[^\]\|]+\]\])')
|
p = re.compile(r'(<!--.*-->|\{\{.*\}\}|\[\[[^\]]+\|[^\]]+\|[^\]\|]+\]\])')
|
||||||
q = re.compile(r'\[\[([^\]]+)\|([^\]]+)]]')
|
q = re.compile(r'\[\[([^\]]+)\|([^\]]+)]]')
|
||||||
@ -42,8 +61,8 @@ def striplink(data):
|
|||||||
(s, n) = r.subn(r"\1", s)
|
(s, n) = r.subn(r"\1", s)
|
||||||
return s.replace("'''", "*")
|
return s.replace("'''", "*")
|
||||||
|
|
||||||
def getPage(terms, lang):
|
def getPage(terms, lang, site="wikipedia"):
|
||||||
conn = http.client.HTTPConnection(lang + ".wikipedia.org", timeout=5)
|
conn = http.client.HTTPConnection(lang + "." + site + ".org", timeout=5)
|
||||||
try:
|
try:
|
||||||
conn.request("GET", "/w/api.php?format=xml&redirects&action=query&prop=revisions&rvprop=content&rvsection=0&titles=%s" % quote(terms), None, {"User-agent": "Nemubot v3"})
|
conn.request("GET", "/w/api.php?format=xml&redirects&action=query&prop=revisions&rvprop=content&rvsection=0&titles=%s" % quote(terms), None, {"User-agent": "Nemubot v3"})
|
||||||
except socket.gaierror:
|
except socket.gaierror:
|
||||||
|
Loading…
Reference in New Issue
Block a user