tools/web: add a URL normalizer function

This commit is contained in:
nemunaire 2015-10-11 17:04:21 +02:00
parent 7102e08000
commit 20105e7d98
3 changed files with 15 additions and 16 deletions

View File

@ -10,8 +10,6 @@ def isup(url):
""" """
o = urllib.parse.urlparse(url, "http") o = urllib.parse.urlparse(url, "http")
if o.netloc == "":
o = urllib.parse.urlparse("http://" + url)
if o.netloc != "": if o.netloc != "":
isup = getJSON("http://isitup.org/%s.json" % o.netloc) isup = getJSON("http://isitup.org/%s.json" % o.netloc)
if isup is not None and "status_code" in isup and isup["status_code"] == 1: if isup is not None and "status_code" in isup and isup["status_code"] == 1:

View File

@ -13,9 +13,7 @@ def validator(url):
o = urllib.parse.urlparse(url, "http") o = urllib.parse.urlparse(url, "http")
if o.netloc == "": if o.netloc == "":
o = urllib.parse.urlparse("http://" + url) raise IRCException("Indicate a valid URL!")
if o.netloc == "":
raise IRCException("Indiquer une URL valide !")
try: try:
req = urllib.request.Request("http://validator.w3.org/check?uri=%s&output=json" % (urllib.parse.quote(o.geturl())), headers={ 'User-Agent' : "Nemubot v%s" % __version__}) req = urllib.request.Request("http://validator.w3.org/check?uri=%s&output=json" % (urllib.parse.quote(o.geturl())), headers={ 'User-Agent' : "Nemubot v%s" % __version__})

View File

@ -16,7 +16,7 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
from urllib.parse import urlparse from urllib.parse import urlparse, urlsplit, urlunsplit
from nemubot.exception import IRCException from nemubot.exception import IRCException
@ -24,38 +24,43 @@ from nemubot.exception import IRCException
def _ensureNetloc(url):
    """Return url in a form whose host part is recognised by urllib.

    urllib only treats text as a network location when it is introduced
    by '//'.  A bare "example.com/page" is therefore parsed as a *path*,
    even when a default scheme is supplied.  When the given string has
    neither a scheme nor a network location (and is not an absolute
    path), prefix it with '//' so that its first segment is read as the
    host.  Any other input is returned unchanged.
    """
    bare = urlsplit(url)
    if url and bare.scheme == "" and bare.netloc == "" and not url.startswith("/"):
        return "//" + url
    return url


def isURL(url):
    """Return True if the URL can be parsed

    A URL is considered parsable when urllib accepts it and finds at
    least a network location or a path in it.
    """
    try:
        o = urlparse(url)
    except ValueError:
        # urllib rejects malformed authorities (e.g. unbalanced '[')
        return False
    # The previous test was inverted: it was True only for empty input
    return bool(o.netloc or o.path)


def getNormalizedURL(url):
    """Return a normalized form for the given URL

    The scheme defaults to http, and a scheme-less "host/path" string is
    normalized to "http://host/path".
    """
    return urlunsplit(urlsplit(_ensureNetloc(url), "http"))


def getScheme(url):
    """Return the protocol of a given URL (http when none is given)"""
    return urlparse(_ensureNetloc(url), "http").scheme


def getHost(url):
    """Return the domain of a given URL (None when there is none)"""
    return urlparse(_ensureNetloc(url), "http").hostname


def getPort(url):
    """Return the port of a given URL (None when not specified)"""
    return urlparse(_ensureNetloc(url), "http").port


def getPath(url):
    """Return the page request of a given URL"""
    return urlparse(_ensureNetloc(url), "http").path


def getUser(url):
    """Return the user name of a given URL (None when not specified)"""
    return urlparse(_ensureNetloc(url), "http").username


def getPassword(url):
    """Return the password of a given URL (None when not specified)"""
    return urlparse(_ensureNetloc(url), "http").password
# Get real pages # Get real pages
@ -69,9 +74,7 @@ def getURLContent(url, body=None, timeout=7, header=None):
timeout -- maximum number of seconds to wait before returning an exception timeout -- maximum number of seconds to wait before returning an exception
""" """
o = urlparse(url) o = urlparse(url, "http")
if o.netloc == "":
o = urlparse("http://" + url)
import http.client import http.client