tools/web: factorize getNormalizedURL

nemunaire 2015-10-14 00:17:02 +02:00
parent 76ec0d26b4
commit 55e6550cb1
7 changed files with 26 additions and 21 deletions


@@ -34,7 +34,7 @@ PROVIDERS = {
 }

 DEFAULT_PROVIDER = "framalink"

-PROVIDERS_NETLOC = [urlparse(url, "http").netloc for f, url in PROVIDERS.values()]
+PROVIDERS_NETLOC = [urlparse(web.getNormalizedURL(url), "http").netloc for f, url in PROVIDERS.values()]

 # LOADING #############################################################
@@ -82,7 +82,7 @@ def parseresponse(msg):
     if hasattr(msg, "text") and msg.text:
         urls = re.findall("([a-zA-Z0-9+.-]+:(?://)?[^ :]+)", msg.text)
         for url in urls:
-            o = urlparse(url, "http")
+            o = urlparse(web._getNormalizedURL(url), "http")

             # Skip short URLs
             if o.netloc == "" or o.netloc in PROVIDERS or len(o.netloc) + len(o.path) < 17:
@@ -118,7 +118,7 @@ def cmd_reduceurl(msg):
     res = list()

     for url in minify:
-        o = urlparse(url, "http")
+        o = urlparse(web.getNormalizedURL(url), "http")
         minief_url = reduce(url)
         if o.netloc == "":
             res.append(gen_response(minief_url, msg, o.scheme))
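
Why this change matters: URLs pasted in chat usually lack a scheme, and urlparse then puts the whole host into path, so the "o.netloc in PROVIDERS" and length checks above never see it. A minimal standard-library sketch of the difference (the frama.link path is a made-up example):

    from urllib.parse import urlparse

    # Before: scheme-less input parses with an empty netloc.
    urlparse("frama.link/u3-q", "http")
    # ParseResult(scheme='http', netloc='', path='frama.link/u3-q', ...)

    # After: getNormalizedURL() ensures a "//" prefix first, so the host
    # lands in netloc where the provider and length checks expect it.
    urlparse("//frama.link/u3-q", "http")
    # ParseResult(scheme='http', netloc='frama.link', path='/u3-q', ...)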


@@ -1,6 +1,6 @@
 import urllib

-from nemubot.tools.web import getJSON
+from nemubot.tools.web import getNormalizedURL, getJSON

 def isup(url):
     """Determine if the given URL is up or not
@@ -9,7 +9,7 @@ def isup(url):
     url -- the URL to check
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(getNormalizedURL(url), "http")
     if o.netloc != "":
         isup = getJSON("http://isitup.org/%s.json" % o.netloc)
         if isup is not None and "status_code" in isup and isup["status_code"] == 1:
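
The practical effect for isup: a bare domain no longer falls through the o.netloc != "" guard but reaches the isitup.org lookup. A sketch under the assumption that getNormalizedURL behaves as defined in the web.py hunk at the end of this commit (example.com is illustrative):

    import urllib.parse

    def getNormalizedURL(url):  # simplified stand-in for nemubot.tools.web
        url = url if "//" in url else "//" + url
        return urllib.parse.urlunsplit(urllib.parse.urlsplit(url, "http"))

    o = urllib.parse.urlparse(getNormalizedURL("example.com"), "http")
    assert o.netloc == "example.com"
    # isup() now queries "http://isitup.org/example.com.json" instead of
    # skipping the lookup for scheme-less input.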


@@ -21,7 +21,7 @@ def headers(url):
     url -- the page URL to get header
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(web.getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("invalid URL")
     if o.scheme == "http":


@@ -3,6 +3,7 @@ import urllib
 from nemubot import __version__
 from nemubot.exception import IRCException
+from nemubot.tools.web import getNormalizedURL


 def validator(url):
     """Run the w3c validator on the given URL
@@ -11,7 +12,7 @@ def validator(url):
     url -- the URL to validate
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("Indicate a valid URL!")


@@ -8,10 +8,10 @@ from urllib.parse import urlparse
 from nemubot.event import ModuleEvent
 from nemubot.exception import IRCException
 from nemubot.hooks import hook
+from nemubot.tools.web import getNormalizedURL
 from nemubot.tools.xmlparser.node import ModuleState

 logger = logging.getLogger("nemubot.module.networking.watchWebsite")

-nemubotversion = 3.4

 from more import Response
@@ -56,7 +56,7 @@ def del_site(url, nick, channel, frm_owner):
     url -- URL to unwatch
     """

-    o = urlparse(url, "http")
+    o = urlparse(getNormalizedURL(url), "http")
     if o.scheme != "" and url in DATAS.index:
         site = DATAS.index[url]
         for a in site.getNodes("alert"):
@@ -80,7 +80,7 @@ def add_site(url, nick, channel, server, diffType="diff"):
     url -- URL to watch
     """

-    o = urlparse(url, "http")
+    o = urlparse(getNormalizedURL(url), "http")

     if o.netloc == "":
         raise IRCException("sorry, I can't watch this URL :(")
@@ -210,7 +210,7 @@ def start_watching(site, offset=0):
     offset -- offset time to delay the launch of the first check
     """

-    o = urlparse(site["url"], "http")
+    o = urlparse(getNormalizedURL(site["url"]), "http")
     #print_debug("Add %s event for site: %s" % (site["type"], o.netloc))

     try:


@@ -3,7 +3,7 @@ import re, json, subprocess
 from nemubot.exception import IRCException
 from nemubot.hooks import hook
-from nemubot.tools.web import getURLContent
+from nemubot.tools.web import _getNormalizedURL, getURLContent

 from more import Response

 """Get information of youtube videos"""
@@ -85,7 +85,7 @@ def parseresponse(msg):
     if hasattr(msg, "text") and msg.text:
         urls = re.findall("([a-zA-Z0-9+.-]+:(?://)?[^ :]+)", msg.text)
         for url in urls:
-            o = urlparse(url)
+            o = urlparse(_getNormalizedURL(url))
             if o.scheme != "":
                 if o.netloc == "" and len(o.path) < 10:
                     continue
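
Note that this module calls the private helper without a default scheme, so the o.scheme != "" test keeps filtering non-URLs. Since _getNormalizedURL only touches strings lacking "//", well-formed links pass through unchanged (the video ids below are illustrative):

    def _getNormalizedURL(url):
        """Return a light normalized form for the given URL"""
        return url if "//" in url else "//" + url

    _getNormalizedURL("https://www.youtube.com/watch?v=xxxxxxxxxxx")
    # unchanged: the string already contains "//"

    _getNormalizedURL("youtu.be/xxxxxxxxxxx")
    # -> "//youtu.be/xxxxxxxxxxx", so urlparse() yields netloc "youtu.be"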


@@ -23,13 +23,17 @@ from nemubot.exception import IRCException
 def isURL(url):
     """Return True if the URL can be parsed"""
-    o = urlparse(url)
+    o = urlparse(_getNormalizedURL(url))
     return o.netloc == "" and o.path == ""


+def _getNormalizedURL(url):
+    """Return a light normalized form for the given URL"""
+    return url if "//" in url else "//" + url
+
+
 def getNormalizedURL(url):
     """Return a normalized form for the given URL"""
-    return urlunsplit(urlsplit(url, "http"))
+    return urlunsplit(urlsplit(_getNormalizedURL(url), "http"))


 def getScheme(url):
@@ -40,27 +44,27 @@ def getScheme(url):
 def getHost(url):
     """Return the domain of a given URL"""
-    return urlparse(url, "http").hostname
+    return urlparse(_getNormalizedURL(url), "http").hostname


 def getPort(url):
     """Return the port of a given URL"""
-    return urlparse(url, "http").port
+    return urlparse(_getNormalizedURL(url), "http").port


 def getPath(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").path
+    return urlparse(_getNormalizedURL(url), "http").path


 def getUser(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").username
+    return urlparse(_getNormalizedURL(url), "http").username


 def getPassword(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").password
+    return urlparse(_getNormalizedURL(url), "http").password


 # Get real pages
@@ -74,7 +78,7 @@ def getURLContent(url, body=None, timeout=7, header=None):
     timeout -- maximum number of seconds to wait before returning an exception
     """

-    o = urlparse(url, "http")
+    o = urlparse(_getNormalizedURL(url), "http")

     import http.client
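
Taken together, the normalization is split into two layers: _getNormalizedURL guarantees a "//" so urlparse/urlsplit treat the leading component as the host, and the public getNormalizedURL builds on it to return a full URL with an assumed default scheme of "http". A self-contained sketch of both, with expected values as comments:

    from urllib.parse import urlsplit, urlunsplit

    def _getNormalizedURL(url):
        """Return a light normalized form for the given URL"""
        return url if "//" in url else "//" + url

    def getNormalizedURL(url):
        """Return a normalized form for the given URL"""
        return urlunsplit(urlsplit(_getNormalizedURL(url), "http"))

    getNormalizedURL("example.com/page")      # 'http://example.com/page'
    getNormalizedURL("https://example.com/")  # 'https://example.com/'
    getNormalizedURL("//example.com")         # 'http://example.com'

The "//" test is deliberately light: anything already containing "//" (absolute http/https URLs, protocol-relative forms) is returned as-is, so the change is a no-op for URLs that were handled correctly before.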