Wiki: better escape of infobox
commit 2fd395f4bc (parent c4e2c0b66c)
@@ -10,25 +10,44 @@ class Wikipedia:
     def __init__(self, terms, lang="fr"):
         self.terms = terms
         self.lang = lang
-        self.curRT = -1
+        self.curRT = 0
         (res, page) = getPage(terms, self.lang)
         if res == http.client.OK or res == http.client.SEE_OTHER:
             self.wres = xmlparser.parse_string(page)
+            if self.wres is None or not (self.wres.hasNode("query") and self.wres.getFirstNode("query").hasNode("pages") and self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page") and self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions")):
+                self.wres = None
+            else:
+                self.infobox = parseInfobox(self)
         else:
             self.wres = None
 
     @property
     def nextRes(self):
-        if self.wres is not None and self.wres.hasNode("query"):
-            if self.wres.getFirstNode("query").hasNode("pages"):
-                if self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page"):
-                    if self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions"):
+        if self.wres is not None:
             for cnt in self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
+                if self.curRT > 0:
+                    self.curRT -= 1
+                    continue
+
                 c = striplink(cnt).strip()
                 if c != "":
                     yield c
 
 
+def parseInfobox(w):
+    inInfobox = False
+    view=-1
+    for cnt in w.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
+        view += 1
+        if inInfobox:
+            if cnt.find("}}") == 0:
+                inInfobox=False
+        elif cnt.find("{{") == 0:
+            inInfobox=True
+        else:
+            w.curRT += view
+            break
+
 def striplink(data):
     p = re.compile(r'(<!--.*-->|\{\{.*\}\}|\[\[[^\]]+\|[^\]]+\|[^\]\|]+\]\])')
     q = re.compile(r'\[\[([^\]]+)\|([^\]]+)]]')
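
Editor's note: the new parseInfobox/curRT pair counts how many leading lines of the wikitext belong to the opening {{ ... }} template (the infobox), so that nextRes can skip them before yielding article text. A minimal standalone sketch of that counting idea follows; the sample wikitext and the skip_leading_template helper are illustrative only and are not part of the module.

# Sketch of the infobox-skipping idea, under the assumptions stated above.
sample = """{{Infobox Settlement
| name = Example
}}
'''Example''' is a city.
It lies on the Example River."""

def skip_leading_template(lines):
    """Return the index of the first line after a leading {{ ... }} block."""
    in_template = False
    for idx, line in enumerate(lines):
        if in_template:
            if line.startswith("}}"):
                in_template = False      # closing braces end the template block
        elif line.startswith("{{"):
            in_template = True           # opening braces start the template block
        else:
            return idx                   # first line of real article text
    return 0

lines = sample.split("\n")
start = skip_leading_template(lines)
print(lines[start:])  # ["'''Example''' is a city.", 'It lies on the Example River.']
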
@@ -42,8 +61,8 @@ def striplink(data):
     (s, n) = r.subn(r"\1", s)
     return s.replace("'''", "*")
 
-def getPage(terms, lang):
-    conn = http.client.HTTPConnection(lang + ".wikipedia.org", timeout=5)
+def getPage(terms, lang, site="wikipedia"):
+    conn = http.client.HTTPConnection(lang + "." + site + ".org", timeout=5)
     try:
         conn.request("GET", "/w/api.php?format=xml&redirects&action=query&prop=revisions&rvprop=content&rvsection=0&titles=%s" % quote(terms), None, {"User-agent": "Nemubot v3"})
     except socket.gaierror:
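
Editor's note: the generalised getPage signature accepts a site argument so the same MediaWiki query can target other projects. The sketch below shows one hedged way such a call could look when pointed at Wiktionary; the fetch_intro helper name and the example term are illustrative, error handling and the module's XML parsing are omitted.

# Illustrative only: reusing the getPage(terms, lang, site="wikipedia") pattern
# to query another MediaWiki project's API.
import http.client
from urllib.parse import quote

def fetch_intro(terms, lang="fr", site="wiktionary"):
    conn = http.client.HTTPConnection(lang + "." + site + ".org", timeout=5)
    conn.request("GET",
                 "/w/api.php?format=xml&redirects&action=query&prop=revisions"
                 "&rvprop=content&rvsection=0&titles=%s" % quote(terms),
                 None, {"User-agent": "Nemubot v3"})
    resp = conn.getresponse()
    return resp.status, resp.read()

# status, body = fetch_intro("bonjour")  # raw XML of the page's section 0
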