diff --git a/modules/watchWebsite.xml b/modules/watchWebsite.xml index f68c219..ed391b5 100644 --- a/modules/watchWebsite.xml +++ b/modules/watchWebsite.xml @@ -1,2 +1,5 @@ - \ No newline at end of file + + + + diff --git a/modules/watchWebsite/Site.py b/modules/watchWebsite/Site.py deleted file mode 100644 index 7d68961..0000000 --- a/modules/watchWebsite/Site.py +++ /dev/null @@ -1,130 +0,0 @@ -# coding=utf-8 - -from datetime import datetime -from datetime import timedelta -import http.client -import hashlib -import socket -import sys -import traceback -from urllib.parse import unquote - -from .atom import Atom - -class Site: - def __init__(self, item): - self.server = item.getAttribute("server") - self.page = item.getAttribute("page") - if len(self.page) <= 0 or self.page[0] != "/": - self.page = "/" + self.page - if item.hasAttribute("type"): - self.type = item.getAttribute("type") - else: - self.type = "hash" - self.message = item.getAttribute("message") - - if item.hasAttribute("time"): - self.updateTime = item.getInt("time") - else: - self.updateTime = 60 - self.lastChange = datetime.now() - self.lastpage = None - - self.channels = list() - for channel in item.getNodes('channel'): - self.channels.append(channel.getAttribute("name")) - - self.categories = dict() - for category in item.getNodes('category'): - self.categories[category.getAttribute("term")] = category.getAttribute("part") - - @property - def update(self): - if self.lastpage is None: - return self.lastChange - else: - return self.lastChange + timedelta(seconds=self.updateTime) - - @property - def url(self): - return self.server + self.page - - def send_message (self, msg): - global SRVS - if len(self.channels) > 0: - for server in SRVS.keys(): - for chan in self.channels: - SRVS[server].send_msg (chan, msg) - else: - for server in SRVS.keys(): - SRVS[server].send_global (msg) - - def treat_atom (self, content): - change=False - f = Atom(content) - if self.lastpage is not None: - diff = self.lastpage.diff (f) - if len(diff) > 0: - print ("[%s] Page differ!"%self.server) - diff.reverse() - for d in diff: - if self.message.count("%s") == 2 and len(self.categories) > 0: - if d.category is None or d.category not in self.categories: - messageI = self.message % (self.categories[""], "%s") - else: - messageI = self.message % (self.categories[d.category], "%s") - self.send_message (messageI % d.link) - elif self.message.count("%s") == 2: - if f.id == youtube.idAtom: - youtube.send_global (d.link2, self.message % (unquote(d.title), d.link)) - else: - self.send_message (self.message % (unquote(d.title), d.link)) - elif self.message.count("%s") == 1: - self.send_message(self.message % unquote (d.title)) - else: - self.send_message(self.message) - change=True - return (f, change) - - def check (self): - try: - #print ("Check %s"%(self.url)) - (status, content) = getPage(self.server, self.page) - if content is None: - return - - if self.type == "atom": - (self.lastpage, change) = self.treat_atom(content) - else: - hash = hashlib.sha224(content).hexdigest() - if hash != self.lastpage: - if self.lastpage is not None: - self.send_message (self.message) - self.lastpage = hash - - self.lastChange = datetime.now() - -# if self.updateTime < 10: -# self.updateTime = 10 -# if self.updateTime > 400: -# self.updateTime = 400 - except: - print ("Une erreur est survenue lors de la récupération de la page " + self.server + "/" + self.page) - exc_type, exc_value, exc_traceback = sys.exc_info() - traceback.print_exception(exc_type, exc_value, exc_traceback) - self.updateTime *= 2 - - -def getPage(s, p): - conn = http.client.HTTPConnection(s, timeout=10) - try: - conn.request("GET", p) - - res = conn.getresponse() - data = res.read() - except: - print ("[%s] impossible de récupérer la page %s."%(s, p)) - return (None, None) - - conn.close() - return (res.status, data) diff --git a/modules/watchWebsite/Watcher.py b/modules/watchWebsite/Watcher.py deleted file mode 100644 index bc77192..0000000 --- a/modules/watchWebsite/Watcher.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding=utf-8 - -from datetime import datetime -import threading - -class Watcher(threading.Thread): - def __init__(self): - self.servers = list() - self.stop = False - self.newSrv = threading.Event() - threading.Thread.__init__(self) - - def addServer(self, server): - self.servers.append(server) - self.newSrv.set() - - def check(self, closer): - closer.check() - self.newSrv.set() - - def run(self): - while not self.stop: - self.newSrv.clear() - closer = None - #Gets the closer server update - for server in self.servers: - if server.update < datetime.now(): - #print ("Closer now: %s à %s"%(server.url, server.update)) - self.check(server) - elif closer is None or server.update < closer.update: - closer = server - if closer is not None: - #print ("Closer: %s à %s"%(closer.url, closer.update)) - timeleft = (closer.update - datetime.now()).seconds - timer = threading.Timer(timeleft, self.check, (closer,)) - timer.start() - #print ("Start timer (%ds)"%timeleft) - - self.newSrv.wait() - - if closer is not None and closer.update is not None and closer.update > datetime.now(): - timer.cancel() - - def stop(self): - self.stop = True - self.newSrv.set() diff --git a/modules/watchWebsite/__init__.py b/modules/watchWebsite/__init__.py index 6842007..8e2c79c 100644 --- a/modules/watchWebsite/__init__.py +++ b/modules/watchWebsite/__init__.py @@ -1,9 +1,18 @@ # coding=utf-8 -nemubotversion = 3.0 +from datetime import datetime +from datetime import timedelta +import http.client +import hashlib +import re +import socket +import sys +import traceback +from urllib.parse import unquote -from .Watcher import Watcher -from . import Site +from .atom import Atom + +nemubotversion = 3.2 def help_tiny (): """Line inserted in the response to the command !help""" @@ -12,22 +21,141 @@ def help_tiny (): def help_full (): return "This module is autonomous you can't interract with it." +CONTEXT = None -WATCHER = None +def load(context): + """Register watched website""" + global CONTEXT + CONTEXT = context + for site in DATAS.getNodes("watch"): + start_watching(site) + +def unload(context): + """Unregister watched website""" + for site in DATAS.getNodes("watch"): + context.del_event(site["evt_id"]) + +def start_watching(site): + print_debug("Add event for site: http://%s%s" % (site["server"], site["page"])) + evt = ModuleEvent(func=getPage, cmp_data=site["lastcontent"], + func_data=dict(s=site["server"], p=site["page"]), + intervalle=site.getInt("time"), + call=alert_change, call_data=site) + site["evt_id"] = CONTEXT.add_event(evt) -def load(): - global WATCHER, DATAS, SRVS - #Load the watcher - Site.SRVS = SRVS - WATCHER = Watcher() - for site in DATAS.getNodes("watch"): - s = Site.Site(site) - WATCHER.addServer(s) - WATCHER.start() +def explore_url(url): + return re.match("^(http://)?([^/]+)(/.*)$", url) -def close(): - global WATCHER - if WATCHER is not None: - WATCHER.stop = True - WATCHER.newSrv.set() +def found_site(s, p): + for site in DATAS: + if site["server"] == s and site["page"] == p: + return site + return None + +def del_site(msg): + if len(msg.cmd) <= 1: + return Response(msg.sender, "quel site dois-je arrêter de surveiller ?", + msg.channel, msg.nick) + + rx = explore_url(msg.cmd[1]) + if rx is not None: + site = found_site(rx.group(2), rx.group(3)) + if site is not None and (msg.sender == site["sender"] or msg.is_owner): + CONTEXT.del_event(site["evt_id"]) + DATAS.delChild(site) + return Response(msg.sender, "je ne surveille désormais plus cette URL.", + channel=msg.channel, nick=msg.nick) + return Response(msg.sender, "je ne surveillais pas cette URL pour vous.", + channel=msg.channel, nick=msg.nick) + +def add_site(msg): + if len(msg.cmd) <= 1: + return Response(msg.sender, "quel site dois-je surveiller ?", + msg.channel, msg.nick) + + rx = explore_url(msg.cmd[1]) + if rx is None: + return Response(msg.sender, "je ne peux pas surveiller cette URL", + channel=msg.channel, nick=msg.nick) + else: + watch = ModuleState("watch") + watch["sender"] = msg.sender + watch["irc"] = msg.srv.id + watch["channel"] = msg.channel + watch["type"] = "diff" + watch["server"] = rx.group(2) + watch["page"] = rx.group(3) + watch["time"] = 123 + watch["message"] = "http://%s%s a changé !" % (watch["server"], + watch["page"]) + DATAS.addChild(watch) + start_watching(watch) + + save() + return Response(msg.sender, channel=msg.channel, nick=msg.nick + message="ce site est maintenant sous ma surveillance.") + +def alert_change(content, site): + """Alert when a change is detected""" + start_watching(site) + if content is None: + return + + if site["type"] == "atom": + if site["_lastpage"] is None: + site["_lastpage"] = Atom(site["lastcontent"]) + page = Atom(content) + diff = site["_lastpage"].diff(page) + if len(diff) > 0: + site["_lastpage"] = page + print_debug("[%s] Page differ!" % site["server"]) + diff.reverse() + for d in diff: + categories = site.getNodes("categories") + categories.setIndex("term") + + if site["message"].count("%s") == 2 and len(categories) > 0: + if d.category is None or d.category not in categories: + messageI = site["message"] % (categories[""], "%s") + else: + messageI = site["message"] % (categories[d.category], "%s") + send_response(site["irc"], Response(site["sender"], + messageI % d.link, + site["channel"])) + elif site["message"].count("%s") == 2: + send_response(site["irc"], Response(site["sender"], + site["message"] % (unquote(d.title), d.link), + site["channel"])) + elif site["message"].count("%s") == 1: + send_response(site["irc"], Response(site["sender"], + site["message"] % unquote (d.title), + site["channel"])) + else: + send_response(site["irc"], Response(site["sender"], + site["message"], + site["channel"])) + else: + return #Stop here, no changes, so don't save + + else: # Just looking for any changes + send_response(site["irc"], Response(site["sender"], site["message"], site["channel"])) + site["lastcontent"] = content + save() + +#TODO: built-in this function +def getPage(s, p): + """Return the page content""" + print_debug("Looking http://%s%s"%(s,p)) + conn = http.client.HTTPConnection(s, timeout=10) + try: + conn.request("GET", p) + + res = conn.getresponse() + data = res.read() + except: + print ("[%s] impossible de récupérer la page %s."%(s, p)) + return None + + conn.close() + return data.decode() diff --git a/modules/watchWebsite/atom.py b/modules/watchWebsite/atom.py index 8915b40..30272e0 100755 --- a/modules/watchWebsite/atom.py +++ b/modules/watchWebsite/atom.py @@ -7,59 +7,63 @@ from xml.dom.minidom import parseString from xml.dom.minidom import getDOMImplementation class AtomEntry: - def __init__ (self, node): - self.id = node.getElementsByTagName("id")[0].firstChild.nodeValue - if node.getElementsByTagName("title")[0].firstChild is not None: - self.title = node.getElementsByTagName("title")[0].firstChild.nodeValue - else: - self.title = "" - try: - self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:19], "%Y-%m-%dT%H:%M:%S") - except: - try: - self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10], "%Y-%m-%d") - except: - print (node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10]) - self.updated = time.localtime () - if len(node.getElementsByTagName("summary")) > 0 and node.getElementsByTagName("summary")[0].firstChild is not None: - self.summary = node.getElementsByTagName("summary")[0].firstChild.nodeValue - else: - self.summary = None - if len(node.getElementsByTagName("link")) > 0: - self.link = node.getElementsByTagName("link")[0].getAttribute ("href") - else: - self.link = None - if len (node.getElementsByTagName("category")) >= 1: - self.category = node.getElementsByTagName("category")[0].getAttribute ("term") - else: - self.category = None - if len (node.getElementsByTagName("link")) > 1: - self.link2 = node.getElementsByTagName("link")[1].getAttribute ("href") - else: - self.link2 = None + def __init__ (self, node): + self.id = node.getElementsByTagName("id")[0].firstChild.nodeValue + if node.getElementsByTagName("title")[0].firstChild is not None: + self.title = node.getElementsByTagName("title")[0].firstChild.nodeValue + else: + self.title = "" + try: + self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:19], "%Y-%m-%dT%H:%M:%S") + except: + try: + self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10], "%Y-%m-%d") + except: + print (node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10]) + self.updated = time.localtime () + if len(node.getElementsByTagName("summary")) > 0 and node.getElementsByTagName("summary")[0].firstChild is not None: + self.summary = node.getElementsByTagName("summary")[0].firstChild.nodeValue + else: + self.summary = None + if len(node.getElementsByTagName("link")) > 0: + self.link = node.getElementsByTagName("link")[0].getAttribute ("href") + else: + self.link = None + if len (node.getElementsByTagName("category")) >= 1: + self.category = node.getElementsByTagName("category")[0].getAttribute ("term") + else: + self.category = None + if len (node.getElementsByTagName("link")) > 1: + self.link2 = node.getElementsByTagName("link")[1].getAttribute ("href") + else: + self.link2 = None class Atom: - def __init__ (self, string): - self.feed = parseString (string).documentElement - self.id = self.feed.getElementsByTagName("id")[0].firstChild.nodeValue - self.title = self.feed.getElementsByTagName("title")[0].firstChild.nodeValue + def __init__ (self, string): + self.raw = string + self.feed = parseString (string).documentElement + self.id = self.feed.getElementsByTagName("id")[0].firstChild.nodeValue + self.title = self.feed.getElementsByTagName("title")[0].firstChild.nodeValue - self.updated = None - self.entries = dict () - for item in self.feed.getElementsByTagName("entry"): - entry = AtomEntry (item) - self.entries[entry.id] = entry - if self.updated is None or self.updated < entry.updated: - self.updated = entry.updated + self.updated = None + self.entries = dict () + for item in self.feed.getElementsByTagName("entry"): + entry = AtomEntry (item) + self.entries[entry.id] = entry + if self.updated is None or self.updated < entry.updated: + self.updated = entry.updated - def diff (self, other): - differ = list () - for k in other.entries.keys (): - if self.updated is None and k not in self.entries: - self.updated = other.entries[k].updated - if k not in self.entries and other.entries[k].updated >= self.updated: - differ.append (other.entries[k]) - return differ + def __str__(self): + return self.raw + + def diff (self, other): + differ = list () + for k in other.entries.keys (): + if self.updated is None and k not in self.entries: + self.updated = other.entries[k].updated + if k not in self.entries and other.entries[k].updated >= self.updated: + differ.append (other.entries[k]) + return differ if __name__ == "__main__":