Convert the watchWebsite module to nemubot V3.2; it can now watch user-requested sites

This commit is contained in:
Némunaire 2012-09-01 16:16:31 +02:00
parent 07863d0718
commit 5e81905947
5 changed files with 203 additions and 244 deletions

View file

@ -1,2 +1,5 @@
<?xml version="1.0" ?> <?xml version="1.0" ?>
<nemubotmodule name="watchWebsite" /> <nemubotmodule name="watchWebsite">
<message type="cmd" name="watch" call="add_site" />
<message type="cmd" name="unwatch" call="del_site" />
</nemubotmodule>

View file

@ -1,130 +0,0 @@
# coding=utf-8
from datetime import datetime
from datetime import timedelta
import http.client
import hashlib
import socket
import sys
import traceback
from urllib.parse import unquote
from .atom import Atom
class Site:
    """A watched web page built from a ``watch`` configuration node.

    A site is either hashed as a whole (``type == "hash"``, the default) or
    parsed as an Atom feed (``type == "atom"``).  ``check()`` fetches the page
    and sends ``message`` when a change is detected.
    """

    def __init__(self, item):
        """Read the watch settings from *item*.

        *item* must provide ``getAttribute``, ``hasAttribute``, ``getInt`` and
        ``getNodes`` (the accessors used below).
        """
        self.server = item.getAttribute("server")
        self.page = item.getAttribute("page")
        # The request path is always made absolute.
        if len(self.page) <= 0 or self.page[0] != "/":
            self.page = "/" + self.page

        if item.hasAttribute("type"):
            self.type = item.getAttribute("type")
        else:
            self.type = "hash"
        self.message = item.getAttribute("message")

        # Delay between two checks, in seconds (see ``update``).
        if item.hasAttribute("time"):
            self.updateTime = item.getInt("time")
        else:
            self.updateTime = 60

        self.lastChange = datetime.now()
        # Last seen state: an Atom object for feeds, a hex digest otherwise,
        # None until the first successful fetch.
        self.lastpage = None

        self.channels = [channel.getAttribute("name")
                         for channel in item.getNodes('channel')]

        # Maps an Atom category term to the text inserted in the message.
        self.categories = {
            category.getAttribute("term"): category.getAttribute("part")
            for category in item.getNodes('category')
        }

    @property
    def update(self):
        """Return the datetime at which the next check is due.

        A site that was never fetched is due at ``lastChange``; afterwards it
        is due ``updateTime`` seconds after ``lastChange``.
        """
        if self.lastpage is None:
            return self.lastChange
        return self.lastChange + timedelta(seconds=self.updateTime)

    @property
    def url(self):
        """Return the server name followed by the absolute page path."""
        return self.server + self.page

    def send_message(self, msg):
        """Send *msg* through every server found in the module global SRVS.

        The message goes to each configured channel, or is sent with
        ``send_global`` when no channel is configured.
        """
        # SRVS is not defined in this class; it is expected to be set on the
        # module by the loader.
        for srv in SRVS.values():
            if len(self.channels) > 0:
                for chan in self.channels:
                    srv.send_msg(chan, msg)
            else:
                srv.send_global(msg)

    def treat_atom(self, content):
        """Parse *content* as an Atom feed and announce its new entries.

        Returns ``(feed, change)`` where *feed* is the parsed ``Atom`` object
        and *change* tells whether at least one new entry was found.  Nothing
        is announced on the first call, when there is no previous feed.
        """
        change = False
        f = Atom(content)
        if self.lastpage is not None:
            diff = self.lastpage.diff(f)
            if len(diff) > 0:
                print ("[%s] Page differ!"%self.server)
                diff.reverse()
                # The template does not change while looping.
                placeholders = self.message.count("%s")
                for d in diff:
                    if placeholders == 2 and len(self.categories) > 0:
                        # First placeholder: category text ("" is the
                        # fallback term); second placeholder: entry link.
                        if d.category is None or d.category not in self.categories:
                            messageI = self.message % (self.categories[""], "%s")
                        else:
                            messageI = self.message % (self.categories[d.category], "%s")
                        self.send_message(messageI % d.link)
                    elif placeholders == 2:
                        text = self.message % (unquote(d.title), d.link)
                        # ``youtube`` is not defined in this module; use it
                        # only when something injected it, instead of
                        # failing with a NameError.
                        yt = globals().get("youtube")
                        if yt is not None and f.id == yt.idAtom:
                            yt.send_global(d.link2, text)
                        else:
                            self.send_message(text)
                    elif placeholders == 1:
                        self.send_message(self.message % unquote(d.title))
                    else:
                        self.send_message(self.message)
                change = True
        return (f, change)

    def check(self):
        """Fetch the page once and send a message if it changed.

        Any error doubles ``updateTime`` so that a failing site is polled
        less often.
        """
        try:
            (_status, content) = getPage(self.server, self.page)
            # NOTE(review): a failed fetch returns without touching
            # lastChange, so the site stays due and is retried as soon as
            # the caller loops again -- confirm this is intended.
            if content is None:
                return

            if self.type == "atom":
                (self.lastpage, _change) = self.treat_atom(content)
            else:
                digest = hashlib.sha224(content).hexdigest()
                if digest != self.lastpage:
                    # The very first digest only initialises the state.
                    if self.lastpage is not None:
                        self.send_message(self.message)
                    self.lastpage = digest

            self.lastChange = datetime.now()
        except Exception:
            # self.page already starts with "/", so use url rather than
            # joining with an extra slash.
            print ("Une erreur est survenue lors de la récupération de la page " + self.url)
            traceback.print_exc()
            self.updateTime *= 2
def getPage(s, p):
    """Fetch page *p* from HTTP server *s*.

    Returns ``(status, body)`` where *body* is the raw response bytes, or
    ``(None, None)`` when the page could not be retrieved.  The request uses
    a 10 second timeout.
    """
    conn = http.client.HTTPConnection(s, timeout=10)
    try:
        conn.request("GET", p)
        res = conn.getresponse()
        data = res.read()
    except Exception:
        # Best effort: any failure is reported and turned into (None, None).
        print ("[%s] impossible de récupérer la page %s."%(s, p))
        return (None, None)
    finally:
        # Release the socket on the failure path too.
        conn.close()
    return (res.status, data)

View file

@ -1,46 +0,0 @@
# coding=utf-8
from datetime import datetime
import threading
class Watcher(threading.Thread):
    """Thread that calls ``check()`` on registered sites when they are due.

    Registered objects must expose an ``update`` datetime (next due time) and
    a ``check()`` method.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.servers = list()
        # Plain flag read by run(); callers may also assign it directly.
        self.stop = False
        # Set whenever the loop in run() has to re-evaluate its schedule.
        self.newSrv = threading.Event()

    def addServer(self, server):
        """Register *server* and wake the loop so it gets scheduled."""
        self.servers.append(server)
        self.newSrv.set()

    def check(self, closer):
        """Check *closer*, then wake the loop to compute the next deadline."""
        closer.check()
        self.newSrv.set()

    def run(self):
        """Check due sites and sleep until the nearest deadline, until stopped."""
        while not self.stop:
            self.newSrv.clear()
            closer = None
            timer = None

            # Check every overdue site; remember the nearest future one.
            for server in self.servers:
                if server.update < datetime.now():
                    self.check(server)
                elif closer is None or server.update < closer.update:
                    closer = server

            if closer is not None:
                # total_seconds() keeps days and the sign.  The deadline may
                # have passed while other sites were being checked, and
                # timedelta.seconds would then report close to a full day.
                timeleft = max(0.0, (closer.update - datetime.now()).total_seconds())
                timer = threading.Timer(timeleft, self.check, (closer,))
                timer.start()

            self.newSrv.wait()

            # Woken before the deadline: drop the pending timer.
            if timer is not None and closer.update is not None and closer.update > datetime.now():
                timer.cancel()

    def shutdown(self):
        """Ask the loop to exit and wake it up."""
        self.stop = True
        self.newSrv.set()

    # Kept for compatibility: on instances this name is shadowed by the
    # ``stop`` flag set in __init__, so it is only reachable through the
    # class.  Use shutdown() on instances.
    stop = shutdown

View file

@ -1,9 +1,18 @@
# coding=utf-8 # coding=utf-8
nemubotversion = 3.0 from datetime import datetime
from datetime import timedelta
import http.client
import hashlib
import re
import socket
import sys
import traceback
from urllib.parse import unquote
from .Watcher import Watcher from .atom import Atom
from . import Site
nemubotversion = 3.2
def help_tiny (): def help_tiny ():
"""Line inserted in the response to the command !help""" """Line inserted in the response to the command !help"""
@ -12,22 +21,141 @@ def help_tiny ():
def help_full (): def help_full ():
return "This module is autonomous you can't interract with it." return "This module is autonomous you can't interract with it."
CONTEXT = None
WATCHER = None def load(context):
"""Register watched website"""
global CONTEXT
CONTEXT = context
for site in DATAS.getNodes("watch"):
start_watching(site)
def unload(context):
    """Remove from *context* the event registered for each watched site."""
    watched = DATAS.getNodes("watch")
    for node in watched:
        event_id = node["evt_id"]
        context.del_event(event_id)
def start_watching(site):
    """Register a polling event for *site* and store its id in ``evt_id``.

    The event calls ``getPage`` with the site's server and page, compares
    against the stored ``lastcontent`` and reports to ``alert_change``.
    """
    server = site["server"]
    page = site["page"]
    print_debug("Add event for site: http://%s%s" % (server, page))

    fetch_args = dict(s=server, p=page)
    event = ModuleEvent(
        func=getPage,
        cmp_data=site["lastcontent"],
        func_data=fetch_args,
        intervalle=site.getInt("time"),
        call=alert_change,
        call_data=site,
    )
    site["evt_id"] = CONTEXT.add_event(event)
def load(): def explore_url(url):
global WATCHER, DATAS, SRVS return re.match("^(http://)?([^/]+)(/.*)$", url)
#Load the watcher
Site.SRVS = SRVS
WATCHER = Watcher()
for site in DATAS.getNodes("watch"):
s = Site.Site(site)
WATCHER.addServer(s)
WATCHER.start()
def close(): def found_site(s, p):
global WATCHER for site in DATAS:
if WATCHER is not None: if site["server"] == s and site["page"] == p:
WATCHER.stop = True return site
WATCHER.newSrv.set() return None
def del_site(msg):
    """Handle the ``unwatch`` command: stop watching the given URL.

    Only the user who registered the watch, or the bot owner, may remove it.
    Always returns a ``Response`` describing the outcome.
    """
    if len(msg.cmd) <= 1:
        return Response(msg.sender, "quel site dois-je arrêter de surveiller ?",
                        channel=msg.channel, nick=msg.nick)

    rx = explore_url(msg.cmd[1])
    if rx is not None:
        # Groups 2 and 3 are the server and the page of the URL.
        site = found_site(rx.group(2), rx.group(3))
        if site is not None and (msg.sender == site["sender"] or msg.is_owner):
            CONTEXT.del_event(site["evt_id"])
            DATAS.delChild(site)
            # Persist the removal, as add_site and alert_change do for
            # their own changes.
            save()
            return Response(msg.sender, "je ne surveille désormais plus cette URL.",
                            channel=msg.channel, nick=msg.nick)

    return Response(msg.sender, "je ne surveillais pas cette URL pour vous.",
                    channel=msg.channel, nick=msg.nick)
def add_site(msg):
    """Handle the ``watch`` command: start watching the given URL.

    Creates a ``watch`` state node for the URL, registers its polling event,
    saves the module data and returns a ``Response`` for the requester.
    """
    if len(msg.cmd) <= 1:
        return Response(msg.sender, "quel site dois-je surveiller ?",
                        msg.channel, msg.nick)

    rx = explore_url(msg.cmd[1])
    if rx is None:
        return Response(msg.sender, "je ne peux pas surveiller cette URL",
                        channel=msg.channel, nick=msg.nick)

    watch = ModuleState("watch")
    # Remember who asked and where, so alerts can be sent back there.
    watch["sender"] = msg.sender
    watch["irc"] = msg.srv.id
    watch["channel"] = msg.channel
    watch["type"] = "diff"
    watch["server"] = rx.group(2)
    watch["page"] = rx.group(3)
    # Polling interval handed to the event by start_watching.
    watch["time"] = 123
    watch["message"] = "http://%s%s a changé !" % (watch["server"],
                                                   watch["page"])
    DATAS.addChild(watch)
    start_watching(watch)
    save()

    # A comma was missing between nick= and message=, which made the whole
    # module fail to parse.
    return Response(msg.sender, channel=msg.channel, nick=msg.nick,
                    message="ce site est maintenant sous ma surveillance.")
def alert_change(content, site):
    """Alert when a change is detected.

    *content* is the freshly fetched page (``None`` when the fetch failed)
    and *site* the watch node it belongs to.  For ``atom`` sites one message
    is sent per new entry; for any other type the site's message is sent
    once.  The new content is then stored in ``lastcontent`` and saved.
    """
    # Register the next polling event before anything else, so watching
    # continues even when this call returns early.
    start_watching(site)

    if content is None:
        return

    if site["type"] == "atom":
        # NOTE(review): assumes "lastcontent" already holds a parsable feed
        # when "_lastpage" is missing -- confirm for a site's first fetch.
        if site["_lastpage"] is None:
            site["_lastpage"] = Atom(site["lastcontent"])
        page = Atom(content)
        diff = site["_lastpage"].diff(page)
        if len(diff) <= 0:
            return  # Stop here, no changes, so don't save

        site["_lastpage"] = page
        print_debug("[%s] Page differ!" % site["server"])
        diff.reverse()

        # Neither the categories nor the template change between entries.
        categories = site.getNodes("categories")
        categories.setIndex("term")
        placeholders = site["message"].count("%s")

        for d in diff:
            if placeholders == 2 and len(categories) > 0:
                # First placeholder: category text ("" is the fallback
                # term); second placeholder: entry link.
                if d.category is None or d.category not in categories:
                    messageI = site["message"] % (categories[""], "%s")
                else:
                    messageI = site["message"] % (categories[d.category], "%s")
                text = messageI % d.link
            elif placeholders == 2:
                text = site["message"] % (unquote(d.title), d.link)
            elif placeholders == 1:
                text = site["message"] % unquote(d.title)
            else:
                text = site["message"]
            send_response(site["irc"],
                          Response(site["sender"], text, site["channel"]))
    else:
        # Just looking for any changes
        send_response(site["irc"],
                      Response(site["sender"], site["message"], site["channel"]))

    site["lastcontent"] = content
    save()
#TODO: built-in this function
def getPage(s, p):
    """Return the content of page *p* on HTTP server *s* as text.

    Returns ``None`` when the page could not be retrieved.  The request uses
    a 10 second timeout.
    """
    print_debug("Looking http://%s%s"%(s,p))
    conn = http.client.HTTPConnection(s, timeout=10)
    try:
        conn.request("GET", p)
        res = conn.getresponse()
        data = res.read()
    except Exception:
        # Best effort: any failure is reported and turned into None.
        print ("[%s] impossible de récupérer la page %s."%(s, p))
        return None
    finally:
        # Release the socket on the failure path too.
        conn.close()
    # Undecodable bytes are replaced instead of raising, so a page that is
    # not valid UTF-8 can still be compared between two fetches.
    return data.decode("utf-8", errors="replace")

View file

@ -7,59 +7,63 @@ from xml.dom.minidom import parseString
from xml.dom.minidom import getDOMImplementation from xml.dom.minidom import getDOMImplementation
class AtomEntry: class AtomEntry:
def __init__ (self, node): def __init__ (self, node):
self.id = node.getElementsByTagName("id")[0].firstChild.nodeValue self.id = node.getElementsByTagName("id")[0].firstChild.nodeValue
if node.getElementsByTagName("title")[0].firstChild is not None: if node.getElementsByTagName("title")[0].firstChild is not None:
self.title = node.getElementsByTagName("title")[0].firstChild.nodeValue self.title = node.getElementsByTagName("title")[0].firstChild.nodeValue
else: else:
self.title = "" self.title = ""
try: try:
self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:19], "%Y-%m-%dT%H:%M:%S") self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:19], "%Y-%m-%dT%H:%M:%S")
except: except:
try: try:
self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10], "%Y-%m-%d") self.updated = time.strptime(node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10], "%Y-%m-%d")
except: except:
print (node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10]) print (node.getElementsByTagName("updated")[0].firstChild.nodeValue[:10])
self.updated = time.localtime () self.updated = time.localtime ()
if len(node.getElementsByTagName("summary")) > 0 and node.getElementsByTagName("summary")[0].firstChild is not None: if len(node.getElementsByTagName("summary")) > 0 and node.getElementsByTagName("summary")[0].firstChild is not None:
self.summary = node.getElementsByTagName("summary")[0].firstChild.nodeValue self.summary = node.getElementsByTagName("summary")[0].firstChild.nodeValue
else: else:
self.summary = None self.summary = None
if len(node.getElementsByTagName("link")) > 0: if len(node.getElementsByTagName("link")) > 0:
self.link = node.getElementsByTagName("link")[0].getAttribute ("href") self.link = node.getElementsByTagName("link")[0].getAttribute ("href")
else: else:
self.link = None self.link = None
if len (node.getElementsByTagName("category")) >= 1: if len (node.getElementsByTagName("category")) >= 1:
self.category = node.getElementsByTagName("category")[0].getAttribute ("term") self.category = node.getElementsByTagName("category")[0].getAttribute ("term")
else: else:
self.category = None self.category = None
if len (node.getElementsByTagName("link")) > 1: if len (node.getElementsByTagName("link")) > 1:
self.link2 = node.getElementsByTagName("link")[1].getAttribute ("href") self.link2 = node.getElementsByTagName("link")[1].getAttribute ("href")
else: else:
self.link2 = None self.link2 = None
class Atom: class Atom:
def __init__ (self, string): def __init__ (self, string):
self.feed = parseString (string).documentElement self.raw = string
self.id = self.feed.getElementsByTagName("id")[0].firstChild.nodeValue self.feed = parseString (string).documentElement
self.title = self.feed.getElementsByTagName("title")[0].firstChild.nodeValue self.id = self.feed.getElementsByTagName("id")[0].firstChild.nodeValue
self.title = self.feed.getElementsByTagName("title")[0].firstChild.nodeValue
self.updated = None self.updated = None
self.entries = dict () self.entries = dict ()
for item in self.feed.getElementsByTagName("entry"): for item in self.feed.getElementsByTagName("entry"):
entry = AtomEntry (item) entry = AtomEntry (item)
self.entries[entry.id] = entry self.entries[entry.id] = entry
if self.updated is None or self.updated < entry.updated: if self.updated is None or self.updated < entry.updated:
self.updated = entry.updated self.updated = entry.updated
def diff (self, other): def __str__(self):
differ = list () return self.raw
for k in other.entries.keys ():
if self.updated is None and k not in self.entries: def diff (self, other):
self.updated = other.entries[k].updated differ = list ()
if k not in self.entries and other.entries[k].updated >= self.updated: for k in other.entries.keys ():
differ.append (other.entries[k]) if self.updated is None and k not in self.entries:
return differ self.updated = other.entries[k].updated
if k not in self.entries and other.entries[k].updated >= self.updated:
differ.append (other.entries[k])
return differ
if __name__ == "__main__": if __name__ == "__main__":