Use urllib.request in DDG module; fix issue #1: improve Wiki output

2012-11-07 14:39:47 +01:00 · 2012-11-07 14:39:47 +01:00 · c3c697bdab
commit c3c697bdab
parent 6f6ddd4d1e
4 changed files with 49 additions and 55 deletions
--- a/modules/ddg/DDGSearch.py
+++ b/modules/ddg/DDGSearch.py
@ -1,13 +1,17 @@
 # coding=utf-8

 from urllib.parse import quote
+from urllib.request import urlopen

+import xmlparser
 from tools import web

 class DDGSearch:
    def __init__(self, terms):
        self.terms = terms
-        self.ddgres = web.getXML("http://api.duckduckgo.com/?q=%s&format=xml" % quote(terms))
+
+        raw = urlopen("https://api.duckduckgo.com/?q=%s&format=xml" % quote(terms), timeout=10)
+        self.ddgres = xmlparser.parse_string(raw.read())

    @property
    def type(self):
--- a/modules/ddg/WFASearch.py
+++ b/modules/ddg/WFASearch.py
@ -1,17 +1,19 @@
 # coding=utf-8

 from urllib.parse import quote
+from urllib.request import urlopen

-from tools import web
+import xmlparser

 class WFASearch:
    def __init__(self, terms):
        self.terms = terms
        try:
-            self.wfares = web.getXML("http://api.wolframalpha.com/v2/query?"
+            raw = urlopen("http://api.wolframalpha.com/v2/query?"
                                     "input=%s&appid=%s"
                                     % (quote(terms),
-                                        CONF.getNode("wfaapi")["key"]))
+                                    CONF.getNode("wfaapi")["key"]), timeout=15)
+            self.wfares = xmlparser.parse_string(raw.read())
        except (TypeError, KeyError):
            print ("You need a Wolfram|Alpha API key in order to use this "
                   "module. Add it to the module configuration file:\n<wfaapi"
--- a/modules/ddg/Wikipedia.py
+++ b/modules/ddg/Wikipedia.py
@ -1,76 +1,55 @@
 # coding=utf-8

-import http.client
 import re
 from urllib.parse import quote
+import urllib.request

 import xmlparser

 class Wikipedia:
-    def __init__(self, terms, lang="fr"):
+    def __init__(self, terms, lang="fr", site="wikipedia.org", section=0):
        self.terms = terms
        self.lang = lang
        self.curRT = 0
-        (res, page) = getPage(terms, self.lang)
-        if res == http.client.OK or res == http.client.SEE_OTHER:
-            self.wres = xmlparser.parse_string(page)
-            if self.wres is None or not (self.wres.hasNode("query") and self.wres.getFirstNode("query").hasNode("pages") and self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page") and self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions")):
-                self.wres = None
-            else:
-                self.infobox = parseInfobox(self)
-        else:
+
+        raw = urllib.request.urlopen(urllib.request.Request("http://" + self.lang + "." + site + "/w/api.php?format=xml&redirects&action=query&prop=revisions&rvprop=content&titles=%s" % (quote(terms)), headers={"User-agent": "Nemubot v3"}))
+        self.wres = xmlparser.parse_string(raw.read())
+        if self.wres is None or not (self.wres.hasNode("query") and self.wres.getFirstNode("query").hasNode("pages") and self.wres.getFirstNode("query").getFirstNode("pages").hasNode("page") and self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").hasNode("revisions")):
            self.wres = None
+        else:
+            self.wres = self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent()
+            self.wres = striplink(self.wres) 

    @property
    def nextRes(self):
        if self.wres is not None:
-            for cnt in self.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
+            for cnt in self.wres.split("\n"):
                if self.curRT > 0:
                    self.curRT -= 1
                    continue

-                c = striplink(cnt).strip()
+                (c, u) = RGXP_s.subn(' ', cnt)
+                c = c.strip()
                if c != "":
                    yield c

+RGXP_p = re.compile(r'(<!--.*-->|<ref[^>]*/>|<ref[^>]*>[^>]*</ref>|<dfn[^>]*>[^>]*</dfn>|\{\{[^}]*\}\}|\[\[([^\[\]]*\[\[[^\]\[]*\]\])+[^\[\]]*\]\]|\{\{([^{}]*\{\{.*\}\}[^{}]*)+\}\}|\[\[[^\]|]+(\|[^\]\|]+)*\]\])', re.I)
+RGXP_l = re.compile(r'\{\{(nobr|lang\|[^|}]+)\|([^}]+)\}\}', re.I)
+RGXP_t = re.compile("==+ *([^=]+) *=+=\n+([^\n])", re.I)
+RGXP_q = re.compile(r'\[\[([^\[\]|]+)\|([^\]|]+)]]', re.I)
+RGXP_r = re.compile(r'\[\[([^\[\]|]+)\]\]', re.I)
+RGXP_s = re.compile(r'\s+')

-def parseInfobox(w):
-    inInfobox = False
-    view=-1
-    for cnt in w.wres.getFirstNode("query").getFirstNode("pages").getFirstNode("page").getFirstNode("revisions").getFirstNode("rev").getContent().split("\n"):
-        view += 1
-        if inInfobox:
-            if cnt.find("}}") == 0:
-                inInfobox=False
-        elif cnt.find("{{") == 0:
-            inInfobox=True
-        else:
-            w.curRT += view
-            break
+def striplink(s):
+    (s, n) = RGXP_l.subn(r"\2", s)
+    if s == "": return s

-def striplink(data):
-    p = re.compile(r'(<!--.*-->|\{\{.*\}\}|\[\[[^\]]+\|[^\]]+\|[^\]\|]+\]\])')
-    q = re.compile(r'\[\[([^\]]+)\|([^\]]+)]]')
-    r = re.compile(r'\[\[([^\]]+)\]\]')
-    (s, n) = p.subn('', data)
-    if s == "":
-        return s
-    (s, n) = q.subn(r"\1", s)
-    if s == "":
-        return s
-    (s, n) = r.subn(r"\1", s)
-    return s.replace("'''", "*")
+    (s, n) = RGXP_q.subn(r"\1", s)
+    if s == "": return s

-def getPage(terms, lang, site="wikipedia"):
-    conn = http.client.HTTPConnection(lang + "." + site + ".org", timeout=5)
-    try:
-        conn.request("GET", "/w/api.php?format=xml&redirects&action=query&prop=revisions&rvprop=content&rvsection=0&titles=%s" % quote(terms), None, {"User-agent": "Nemubot v3"})
-    except socket.gaierror:
-        print ("impossible de récupérer la page %s."%(p))
-        return (http.client.INTERNAL_SERVER_ERROR, None)
+    (s, n) = RGXP_r.subn(r"\1", s)
+    if s == "": return s

-    res = conn.getresponse()
-    data = res.read()
-
-    conn.close()
-    return (res.status, data)
+    (s, n) = RGXP_p.subn('', s)
+    (s, n) = RGXP_t.subn("\x03\x16" + r"\1" + " :\x03\x16  " + r"\2", s)
+    return s.replace("'''", "\x03\x02").replace("''", "\x03\x1f")
--- a/modules/ddg/init.py
+++ b/modules/ddg/init.py
@ -25,6 +25,7 @@ def load(context):
    add_hook("cmd_hook", Hook(calculate, "wa"))
    add_hook("cmd_hook", Hook(calculate, "wfa"))
    add_hook("cmd_hook", Hook(calculate, "calc"))
+    add_hook("cmd_hook", Hook(wiki, "dico"))
    add_hook("cmd_hook", Hook(wiki, "w"))
    add_hook("cmd_hook", Hook(wiki, "wf"))
    add_hook("cmd_hook", Hook(wiki, "wfr"))
@ -98,12 +99,20 @@ def wiki(msg):
        return Response(msg.sender,
                        "Indicate a term to search",
                        msg.channel, nick=msg.nick)
-    if msg.cmds[0] == "w" or msg.cmds[0] == "wf" or msg.cmds[0] == "wfr":
+    if msg.cmds[0] == "dico":
        lang = "fr"
+        site = "wiktionary.org"
+        section = 1
+    elif msg.cmds[0] == "w" or msg.cmds[0] == "wf" or msg.cmds[0] == "wfr":
+        lang = "fr"
+        site = "wikipedia.org"
+        section = 0
    else:
        lang = "en"
+        site = "wikipedia.org"
+        section = 0

-    s = Wikipedia.Wikipedia(' '.join(msg.cmds[1:]), lang)
+    s = Wikipedia.Wikipedia(' '.join(msg.cmds[1:]), lang, site, section)

    res = Response(msg.sender, channel=msg.channel, nomore="No more results")
    for result in s.nextRes: