tools/web: factorize getNormalizedURL

nemunaire 2015-10-14 00:17:02 +02:00
parent 76ec0d26b4
commit 55e6550cb1
7 changed files with 26 additions and 21 deletions

View File

@@ -34,7 +34,7 @@ PROVIDERS = {
 }
 DEFAULT_PROVIDER = "framalink"

-PROVIDERS_NETLOC = [urlparse(url, "http").netloc for f, url in PROVIDERS.values()]
+PROVIDERS_NETLOC = [urlparse(web.getNormalizedURL(url), "http").netloc for f, url in PROVIDERS.values()]

 # LOADING #############################################################
@@ -82,7 +82,7 @@ def parseresponse(msg):
     if hasattr(msg, "text") and msg.text:
         urls = re.findall("([a-zA-Z0-9+.-]+:(?://)?[^ :]+)", msg.text)
         for url in urls:
-            o = urlparse(url, "http")
+            o = urlparse(web._getNormalizedURL(url), "http")

             # Skip short URLs
             if o.netloc == "" or o.netloc in PROVIDERS or len(o.netloc) + len(o.path) < 17:
@@ -118,7 +118,7 @@ def cmd_reduceurl(msg):
     res = list()

     for url in minify:
-        o = urlparse(url, "http")
+        o = urlparse(web.getNormalizedURL(url), "http")
         minief_url = reduce(url)

         if o.netloc == "":
             res.append(gen_response(minief_url, msg, o.scheme))
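
The three hunks above all branch on o.netloc, and urlparse only recognizes a netloc after a "//" marker: a URL written without one lands entirely in path, with an empty host. A quick illustration in a Python 3 shell, using only the standard urllib.parse (frama.link serves as a sample host):

>>> from urllib.parse import urlparse
>>> urlparse("frama.link/abc", "http")
ParseResult(scheme='http', netloc='', path='frama.link/abc', params='', query='', fragment='')
>>> urlparse("//frama.link/abc", "http")
ParseResult(scheme='http', netloc='frama.link', path='/abc', params='', query='', fragment='')

Prepending "//" when it is missing is exactly what the new helper does (see the last file of this commit), so these netloc tests and the len(o.netloc) + len(o.path) < 17 short-URL heuristic now see a real host name.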

View File

@@ -1,6 +1,6 @@
 import urllib

-from nemubot.tools.web import getJSON
+from nemubot.tools.web import getNormalizedURL, getJSON

 def isup(url):
     """Determine if the given URL is up or not
@@ -9,7 +9,7 @@ def isup(url):
     url -- the URL to check
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(getNormalizedURL(url), "http")
     if o.netloc != "":
         isup = getJSON("http://isitup.org/%s.json" % o.netloc)
         if isup is not None and "status_code" in isup and isup["status_code"] == 1:
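
For reference, getJSON above simply fetches and decodes isitup.org's JSON endpoint. A minimal standalone sketch of the same check, standard library only (the timeout and User-Agent values are arbitrary choices for this sketch, not taken from the module):

import json
import urllib.request
from urllib.parse import urlparse

def isup(url):
    """Return True when isitup.org reports the host as reachable."""
    # Same normalization trick: force a netloc when "//" is missing.
    o = urlparse(url if "//" in url else "//" + url, "http")
    if o.netloc == "":
        return False
    req = urllib.request.Request("http://isitup.org/%s.json" % o.netloc,
                                 headers={"User-Agent": "isup-sketch"})
    with urllib.request.urlopen(req, timeout=7) as resp:
        status = json.loads(resp.read().decode("utf-8"))
    # The module tests status_code == 1, isitup.org's "site is up" answer.
    return status.get("status_code") == 1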

View File

@@ -21,7 +21,7 @@ def headers(url):
     url -- the page URL to get header
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(web.getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("invalid URL")

     if o.scheme == "http":
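
The o.scheme == "http" branch above picks an http.client connection class per scheme. A HEAD-request version of the same idea looks roughly like this (a sketch under that assumption, not the module's exact code):

import http.client
from urllib.parse import urlparse

def headers(url):
    """Return (status, header list) from a HEAD request on url."""
    # Same normalization as web.getNormalizedURL: force a netloc.
    o = urlparse(url if "//" in url else "//" + url, "http")
    if o.netloc == "":
        raise ValueError("invalid URL")
    if o.scheme == "http":
        conn = http.client.HTTPConnection(o.netloc, timeout=5)
    else:
        conn = http.client.HTTPSConnection(o.netloc, timeout=5)
    try:
        conn.request("HEAD", o.path if o.path else "/")
        resp = conn.getresponse()
        return resp.status, resp.getheaders()
    finally:
        conn.close()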

View File

@@ -3,6 +3,7 @@ import urllib

 from nemubot import __version__
 from nemubot.exception import IRCException
+from nemubot.tools.web import getNormalizedURL

 def validator(url):
     """Run the w3c validator on the given URL
@@ -11,7 +12,7 @@ def validator(url):
     url -- the URL to validate
     """

-    o = urllib.parse.urlparse(url, "http")
+    o = urllib.parse.urlparse(getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("Indicate a valid URL!")

View File

@@ -8,10 +8,10 @@ from urllib.parse import urlparse
 from nemubot.event import ModuleEvent
 from nemubot.exception import IRCException
 from nemubot.hooks import hook
+from nemubot.tools.web import getNormalizedURL
 from nemubot.tools.xmlparser.node import ModuleState

 logger = logging.getLogger("nemubot.module.networking.watchWebsite")

 nemubotversion = 3.4

 from more import Response
@@ -56,7 +56,7 @@ def del_site(url, nick, channel, frm_owner):
     url -- URL to unwatch
     """

-    o = urlparse(url, "http")
+    o = urlparse(getNormalizedURL(url), "http")
     if o.scheme != "" and url in DATAS.index:
         site = DATAS.index[url]
         for a in site.getNodes("alert"):
@@ -80,7 +80,7 @@ def add_site(url, nick, channel, server, diffType="diff"):
     url -- URL to watch
     """

-    o = urlparse(url, "http")
+    o = urlparse(getNormalizedURL(url), "http")
     if o.netloc == "":
         raise IRCException("sorry, I can't watch this URL :(")
@@ -210,7 +210,7 @@ def start_watching(site, offset=0):
     offset -- offset time to delay the launch of the first check
     """

-    o = urlparse(site["url"], "http")
+    o = urlparse(getNormalizedURL(site["url"]), "http")
     #print_debug("Add %s event for site: %s" % (site["type"], o.netloc))
     try:
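
A subtle effect in del_site above: getNormalizedURL always hands urlparse a string containing "//", and urlparse receives "http" as its default scheme, so o.scheme can no longer be empty; the o.scheme != "" half of the guard is now always true and only the url in DATAS.index test still filters. A one-line check of that claim:

>>> from urllib.parse import urlparse
>>> urlparse("//frama.link", "http").scheme
'http'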

View File

@@ -3,7 +3,7 @@ import re, json, subprocess

 from nemubot.exception import IRCException
 from nemubot.hooks import hook
-from nemubot.tools.web import getURLContent
+from nemubot.tools.web import _getNormalizedURL, getURLContent

 from more import Response

 """Get information of youtube videos"""
@@ -85,7 +85,7 @@ def parseresponse(msg):
     if hasattr(msg, "text") and msg.text:
         urls = re.findall("([a-zA-Z0-9+.-]+:(?://)?[^ :]+)", msg.text)
         for url in urls:
-            o = urlparse(url)
+            o = urlparse(_getNormalizedURL(url))
             if o.scheme != "":
                 if o.netloc == "" and len(o.path) < 10:
                     continue
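
Note how the helper interacts with the URL regex used here: the pattern [a-zA-Z0-9+.-]+: requires a colon, so a match such as example.com:8080/page carries no real scheme, and a bare urlparse reads example.com as one. The "//" prepended by _getNormalizedURL corrects that reading:

>>> from urllib.parse import urlparse
>>> urlparse("example.com:8080/page")
ParseResult(scheme='example.com', netloc='', path='8080/page', params='', query='', fragment='')
>>> urlparse("//example.com:8080/page")
ParseResult(scheme='', netloc='example.com:8080', path='/page', params='', query='', fragment='')

Fully qualified URLs such as https://youtu.be/xyz already contain "//" and pass through the helper unchanged.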

View File

@@ -23,13 +23,17 @@ from nemubot.exception import IRCException

 def isURL(url):
     """Return True if the URL can be parsed"""
-    o = urlparse(url)
+    o = urlparse(_getNormalizedURL(url))
     return o.netloc == "" and o.path == ""

+def _getNormalizedURL(url):
+    """Return a light normalized form for the given URL"""
+    return url if "//" in url else "//" + url
+
 def getNormalizedURL(url):
     """Return a normalized form for the given URL"""
-    return urlunsplit(urlsplit(url, "http"))
+    return urlunsplit(urlsplit(_getNormalizedURL(url), "http"))

 def getScheme(url):
@@ -40,27 +44,27 @@ def getScheme(url):

 def getHost(url):
     """Return the domain of a given URL"""
-    return urlparse(url, "http").hostname
+    return urlparse(_getNormalizedURL(url), "http").hostname

 def getPort(url):
     """Return the port of a given URL"""
-    return urlparse(url, "http").port
+    return urlparse(_getNormalizedURL(url), "http").port

 def getPath(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").path
+    return urlparse(_getNormalizedURL(url), "http").path

 def getUser(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").username
+    return urlparse(_getNormalizedURL(url), "http").username

 def getPassword(url):
     """Return the page request of a given URL"""
-    return urlparse(url, "http").password
+    return urlparse(_getNormalizedURL(url), "http").password

 # Get real pages
@@ -74,7 +78,7 @@ def getURLContent(url, body=None, timeout=7, header=None):
     timeout -- maximum number of seconds to wait before returning an exception
     """

-    o = urlparse(url, "http")
+    o = urlparse(_getNormalizedURL(url), "http")

     import http.client
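
Taken together, the two helpers now compose as follows (Python 3 shell; urlsplit and urlunsplit come from urllib.parse, presumably imported at the top of this file, outside the hunks shown):

>>> from urllib.parse import urlsplit, urlunsplit
>>> def _getNormalizedURL(url):
...     return url if "//" in url else "//" + url
...
>>> def getNormalizedURL(url):
...     return urlunsplit(urlsplit(_getNormalizedURL(url), "http"))
...
>>> getNormalizedURL("example.com/page")
'http://example.com/page'
>>> getNormalizedURL("https://example.com")
'https://example.com'
>>> urlunsplit(urlsplit("example.com/page", "http"))
'http:example.com/page'

The last call reproduces the pre-commit behaviour of getNormalizedURL: without the "//" hint the host was folded into the path and the round trip produced a malformed URL, which is what the factorized helper removes.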