Wiki: better escape of infobox

This commit is contained in:
Némunaire 2012-10-04 13:46:11 +02:00
parent c4e2c0b66c
commit 2fd395f4bc

View File

@ -10,48 +10,67 @@ class Wikipedia:
def __init__(self, terms, lang="fr"):
    """Fetch the lead section of the wiki article named *terms*.

    On success ``self.wres`` holds the parsed API answer and
    ``self.infobox`` the parsed infobox; on any failure (HTTP error or
    unexpected XML shape) ``self.wres`` is set to None.
    """
    self.terms = terms
    self.lang = lang
    self.curRT = 0

    status, page = getPage(terms, self.lang)
    if status != http.client.OK and status != http.client.SEE_OTHER:
        self.wres = None
        return

    self.wres = xmlparser.parse_string(page)
    # Walk down the expected XML structure; bail out at the first
    # missing level so nextRes can rely on the full chain existing.
    node = self.wres
    for tag in ("query", "pages", "page"):
        if node is None or not node.hasNode(tag):
            self.wres = None
            return
        node = node.getFirstNode(tag)
    if not node.hasNode("revisions"):
        self.wres = None
        return

    self.infobox = parseInfobox(self)
@property
def nextRes(self):
    """Yield the cleaned, non-empty lines of the article body.

    The first ``self.curRT`` raw lines (the infobox region found by
    parseInfobox) are skipped, decrementing the counter as they go.
    """
    if self.wres is None:
        return
    rev = (self.wres.getFirstNode("query")
                    .getFirstNode("pages")
                    .getFirstNode("page")
                    .getFirstNode("revisions")
                    .getFirstNode("rev"))
    for raw in rev.getContent().split("\n"):
        if self.curRT > 0:
            self.curRT -= 1
            continue
        cleaned = striplink(raw).strip()
        if cleaned:
            yield cleaned
def parseInfobox(w):
    """Locate the leading infobox template of the article held by *w*.

    Scans the revision content line by line: a line starting with
    ``{{`` opens the infobox, a line starting with ``}}`` closes it,
    and the first line that is neither inside a template nor opening
    one marks the start of the regular article text.

    Side effect: advances ``w.curRT`` by the index of that first
    regular line, so ``nextRes`` skips the infobox region.

    Returns the raw lines found inside the infobox as a list.  (Bug
    fix: the original had no return statement, so ``Wikipedia.infobox``
    was always None despite being assigned from this function.)
    """
    content = (w.wres.getFirstNode("query")
                     .getFirstNode("pages")
                     .getFirstNode("page")
                     .getFirstNode("revisions")
                     .getFirstNode("rev").getContent())
    box_lines = []
    in_infobox = False
    for index, line in enumerate(content.split("\n")):
        if in_infobox:
            if line.startswith("}}"):
                in_infobox = False
            else:
                box_lines.append(line)
        elif line.startswith("{{"):
            in_infobox = True
        else:
            # First regular content line: remember how many raw lines
            # nextRes must skip to land here.
            w.curRT += index
            break
    return box_lines
def striplink(data):
    """Strip wiki markup from a single line of article text.

    Removes HTML comments, ``{{...}}`` templates and three-part links,
    reduces ``[[target|label]]`` and ``[[target]]`` links to their
    first component, and turns bold ``'''`` markers into ``*``.
    """
    drop_markup = re.compile(r'(<!--.*-->|\{\{.*\}\}|\[\[[^\]]+\|[^\]]+\|[^\]\|]+\]\])')
    piped_link = re.compile(r'\[\[([^\]]+)\|([^\]]+)]]')
    plain_link = re.compile(r'\[\[([^\]]+)\]\]')

    text = drop_markup.sub('', data)
    if not text:
        return text
    text = piped_link.sub(r"\1", text)
    if not text:
        return text
    text = plain_link.sub(r"\1", text)
    return text.replace("'''", "*")
def getPage(terms, lang, site="wikipedia"):
    """Fetch the raw API answer for the article *terms*.

    Queries ``<lang>.<site>.org`` for the XML-formatted content of the
    article's section 0, following redirects.

    Returns a tuple ``(status, body)`` where *status* is the HTTP
    status code and *body* the raw response bytes; on DNS failure
    returns ``(http.client.INTERNAL_SERVER_ERROR, None)``.
    """
    conn = http.client.HTTPConnection(lang + "." + site + ".org", timeout=5)
    try:
        conn.request("GET",
                     "/w/api.php?format=xml&redirects&action=query"
                     "&prop=revisions&rvprop=content&rvsection=0"
                     "&titles=%s" % quote(terms),
                     None, {"User-agent": "Nemubot v3"})
    except socket.gaierror:
        # Bug fix: the message formatted an undefined name `p`, which
        # raised NameError inside the handler; use `terms` instead.
        print("impossible de récupérer la page %s." % terms)
        conn.close()  # bug fix: the connection leaked on this path
        return (http.client.INTERNAL_SERVER_ERROR, None)
    res = conn.getresponse()
    data = res.read()
    conn.close()
    return (res.status, data)