tools/web: factorize getNormalizedURL

nemunaire 2015-10-14 00:17:02 +02:00
parent 76ec0d26b4
commit 55e6550cb1
7 changed files with 26 additions and 21 deletions

View File

@@ -34,7 +34,7 @@ PROVIDERS = {
 }
 DEFAULT_PROVIDER = "framalink"

-PROVIDERS_NETLOC = [urlparse(url, "http").netloc for f, url in PROVIDERS.values()]
+PROVIDERS_NETLOC = [urlparse(web.getNormalizedURL(url), "http").netloc for f, url in PROVIDERS.values()]

 # LOADING #############################################################
@@ -82,7 +82,7 @@ def parseresponse(msg):
     if hasattr(msg, "text") and msg.text:
         urls = re.findall("([a-zA-Z0-9+.-]+:(?://)?[^ :]+)", msg.text)
         for url in urls:
-            o = urlparse(url, "http")
+            o = urlparse(web._getNormalizedURL(url), "http")

             # Skip short URLs
             if o.netloc == "" or o.netloc in PROVIDERS or len(o.netloc) + len(o.path) < 17:
@@ -118,7 +118,7 @@ def cmd_reduceurl(msg):
     res = list()

     for url in minify:
-        o = urlparse(url, "http")
+        o = urlparse(web.getNormalizedURL(url), "http")
         minief_url = reduce(url)

         if o.netloc == "":
             res.append(gen_response(minief_url, msg, o.scheme))
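
The three hunks above all branch on o.netloc, and urlparse only recognizes a netloc after a "//" marker: a URL written without one lands entirely in path, with an empty host. A quick illustration in a Python 3 shell, using only the standard urllib.parse (frama.link serves as a sample host):

>>> from urllib.parse import urlparse
>>> urlparse("frama.link/abc", "http")
ParseResult(scheme='http', netloc='', path='frama.link/abc', params='', query='', fragment='')
>>> urlparse("//frama.link/abc", "http")
ParseResult(scheme='http', netloc='frama.link', path='/abc', params='', query='', fragment='')

Prepending "//" when it is missing is exactly what the new helper does (see the last file of this commit), so these netloc tests and the len(o.netloc) + len(o.path) < 17 short-URL heuristic now see a real host name.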

View File

@@ -1,6 +1,6 @@
 import urllib

-from nemubot.tools.web import getJSON
+from nemubot.tools.web import getNormalizedURL, getJSON

 def isup(url):
     """Determine if the given URL is up or not
@@ -9,7 +9,7 @@ def isup(url):
     url -- the URL to check
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(getNormalizedURL(url), "http")
     if o.netloc != "":
         isup = getJSON("http://isitup.org/%s.json" % o.netloc)
         if isup is not None and "status_code" in isup and isup["status_code"] == 1:
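
For reference, getJSON above simply fetches and decodes isitup.org's JSON endpoint. A minimal standalone sketch of the same check, standard library only (the timeout and User-Agent values are arbitrary choices for this sketch, not taken from the module):

import json
import urllib.request
from urllib.parse import urlparse

def isup(url):
    """Return True when isitup.org reports the host as reachable."""
    # Same normalization trick: force a netloc when "//" is missing.
    o = urlparse(url if "//" in url else "//" + url, "http")
    if o.netloc == "":
        return False
    req = urllib.request.Request("http://isitup.org/%s.json" % o.netloc,
                                 headers={"User-Agent": "isup-sketch"})
    with urllib.request.urlopen(req, timeout=7) as resp:
        status = json.loads(resp.read().decode("utf-8"))
    # The module tests status_code == 1, isitup.org's "site is up" answer.
    return status.get("status_code") == 1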

View File

@@ -21,7 +21,7 @@ def headers(url):
     url -- the page URL to get header
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(web.getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("invalid URL")

     if o.scheme == "http":
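
The o.scheme == "http" branch above picks an http.client connection class per scheme. A HEAD-request version of the same idea looks roughly like this (a sketch under that assumption, not the module's exact code):

import http.client
from urllib.parse import urlparse

def headers(url):
    """Return (status, header list) from a HEAD request on url."""
    # Same normalization as web.getNormalizedURL: force a netloc.
    o = urlparse(url if "//" in url else "//" + url, "http")
    if o.netloc == "":
        raise ValueError("invalid URL")
    if o.scheme == "http":
        conn = http.client.HTTPConnection(o.netloc, timeout=5)
    else:
        conn = http.client.HTTPSConnection(o.netloc, timeout=5)
    try:
        conn.request("HEAD", o.path if o.path else "/")
        resp = conn.getresponse()
        return resp.status, resp.getheaders()
    finally:
        conn.close()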

View File

@@ -3,6 +3,7 @@ import urllib

 from nemubot import __version__
 from nemubot.exception import IRCException
+from nemubot.tools.web import getNormalizedURL

 def validator(url):
     """Run the w3c validator on the given URL
@@ -11,7 +12,7 @@ def validator(url):
     url -- the URL to validate
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("Indicate a valid URL!")

View File

@@ -8,10 +8,10 @@ from urllib.parse import urlparse
 from nemubot.event import ModuleEvent
 from nemubot.exception import IRCException
 from nemubot.hooks import hook
+from nemubot.tools.web import getNormalizedURL
 from nemubot.tools.xmlparser.node import ModuleState

 logger = logging.getLogger("nemubot.module.networking.watchWebsite")

 nemubotversion = 3.4

 from more import Response
@@ -56,7 +56,7 @@ def del_site(url, nick, channel, frm_owner):
     url -- URL to unwatch
     """

-    o = urlparse(url, "http")
+    o = urlparse(getNormalizedURL(url), "http")
     if o.scheme != "" and url in DATAS.index:
         site = DATAS.index[url]
         for a in site.getNodes("alert"):
@@ -80,7 +80,7 @@ def add_site(url, nick, channel, server, diffType="diff"):
     url -- URL to watch
     """

-    o = urlparse(url, "http")
+    o = urlparse(getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("sorry, I can't watch this URL :(")
@@ -210,7 +210,7 @@ def start_watching(site, offset=0):
     offset -- offset time to delay the launch of the first check
     """

-    o = urlparse(site["url"], "http")
+    o = urlparse(getNormalizedURL(site["url"]), "http")
     #print_debug("Add %s event for site: %s" % (site["type"], o.netloc))
     try:
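
A subtle effect in del_site above: getNormalizedURL always hands urlparse a string containing "//", and urlparse receives "http" as its default scheme, so o.scheme can no longer be empty; the o.scheme != "" half of the guard is now always true and only the url in DATAS.index test still filters. A one-line check of that claim:

>>> from urllib.parse import urlparse
>>> urlparse("//frama.link", "http").scheme
'http'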

View File

@@ -3,7 +3,7 @@ import re, json, subprocess

 from nemubot.exception import IRCException
 from nemubot.hooks import hook
-from nemubot.tools.web import getURLContent
+from nemubot.tools.web import _getNormalizedURL, getURLContent

 from more import Response

 """Get information of youtube videos"""
@@ -85,7 +85,7 @@ def parseresponse(msg):
     if hasattr(msg, "text") and msg.text:
         urls = re.findall("([a-zA-Z0-9+.-]+:(?://)?[^ :]+)", msg.text)
         for url in urls:
-            o = urlparse(url)
+            o = urlparse(_getNormalizedURL(url))
             if o.scheme != "":
                 if o.netloc == "" and len(o.path) < 10:
                     continue
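
Note how the helper interacts with the URL regex used here: the pattern [a-zA-Z0-9+.-]+: requires a colon, so a match such as example.com:8080/page carries no real scheme, and a bare urlparse reads example.com as one. The "//" prepended by _getNormalizedURL corrects that reading:

>>> from urllib.parse import urlparse
>>> urlparse("example.com:8080/page")
ParseResult(scheme='example.com', netloc='', path='8080/page', params='', query='', fragment='')
>>> urlparse("//example.com:8080/page")
ParseResult(scheme='', netloc='example.com:8080', path='/page', params='', query='', fragment='')

Fully qualified URLs such as https://youtu.be/xyz already contain "//" and pass through the helper unchanged.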

View File

@@ -23,13 +23,17 @@ from nemubot.exception import IRCException

 def isURL(url):
     """Return True if the URL can be parsed"""
-    o = urlparse(url)
+    o = urlparse(_getNormalizedURL(url))
     return o.netloc == "" and o.path == ""

+def _getNormalizedURL(url):
+    """Return a light normalized form for the given URL"""
+    return url if "//" in url else "//" + url
+
 def getNormalizedURL(url):
     """Return a normalized form for the given URL"""
-    return urlunsplit(urlsplit(url, "http"))
+    return urlunsplit(urlsplit(_getNormalizedURL(url), "http"))

 def getScheme(url):
@@ -40,27 +44,27 @@ def getScheme(url):

 def getHost(url):
     """Return the domain of a given URL"""
-    return urlparse(url, "http").hostname
+    return urlparse(_getNormalizedURL(url), "http").hostname

 def getPort(url):
     """Return the port of a given URL"""
-    return urlparse(url, "http").port
+    return urlparse(_getNormalizedURL(url), "http").port

 def getPath(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").path
+    return urlparse(_getNormalizedURL(url), "http").path

 def getUser(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").username
+    return urlparse(_getNormalizedURL(url), "http").username

 def getPassword(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").password
+    return urlparse(_getNormalizedURL(url), "http").password

 # Get real pages
@@ -74,7 +78,7 @@ def getURLContent(url, body=None, timeout=7, header=None):
     timeout -- maximum number of seconds to wait before returning an exception
     """

-    o = urlparse(url, "http")
+    o = urlparse(_getNormalizedURL(url), "http")

     import http.client
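
Taken together, the two helpers now compose as follows (Python 3 shell; urlsplit and urlunsplit come from urllib.parse, presumably imported at the top of this file, outside the hunks shown):

>>> from urllib.parse import urlsplit, urlunsplit
>>> def _getNormalizedURL(url):
...     return url if "//" in url else "//" + url
...
>>> def getNormalizedURL(url):
...     return urlunsplit(urlsplit(_getNormalizedURL(url), "http"))
...
>>> getNormalizedURL("example.com/page")
'http://example.com/page'
>>> getNormalizedURL("https://example.com")
'https://example.com'
>>> urlunsplit(urlsplit("example.com/page", "http"))
'http:example.com/page'

The last call reproduces the pre-commit behaviour of getNormalizedURL: without the "//" hint the host was folded into the path and the round trip produced a malformed URL, which is what the factorized helper removes.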