Convert module watchWebsite to nemubot V3.2; it can now watch user-requested sites

This commit is contained in:
Némunaire 2012-09-01 16:16:31 +02:00
parent 07863d0718
commit 5e81905947
5 changed files with 203 additions and 244 deletions

View File

@ -1,2 +1,5 @@
<?xml version="1.0" ?>
<nemubotmodule name="watchWebsite" />
<nemubotmodule name="watchWebsite">
<message type="cmd" name="watch" call="add_site" />
<message type="cmd" name="unwatch" call="del_site" />
</nemubotmodule>

View File

@ -1,130 +0,0 @@
# coding=utf-8
from datetime import datetime
from datetime import timedelta
import http.client
import hashlib
import socket
import sys
import traceback
from urllib.parse import unquote
from .atom import Atom
class Site:
    """A watched web page and the notification sent when it changes.

    The settings are read from a configuration node: ``server`` and ``page``
    locate the document, ``type`` selects the comparison mode (``"atom"``
    diffs feed entries, anything else compares a SHA-224 digest of the body),
    ``message`` is the notification template and ``time`` the delay between
    two checks (60 when absent).
    """

    def __init__(self, item):
        """Load the watch settings from the configuration node *item*."""
        self.server = item.getAttribute("server")
        self.page = item.getAttribute("page")
        # The request path has to be absolute.
        if not self.page.startswith("/"):
            self.page = "/" + self.page
        if item.hasAttribute("type"):
            self.type = item.getAttribute("type")
        else:
            self.type = "hash"
        self.message = item.getAttribute("message")
        if item.hasAttribute("time"):
            self.updateTime = item.getInt("time")
        else:
            self.updateTime = 60
        self.lastChange = datetime.now()
        # Last seen state: an Atom object in "atom" mode, a hex digest
        # otherwise, None until the first successful check.
        self.lastpage = None
        self.channels = [channel.getAttribute("name")
                         for channel in item.getNodes('channel')]
        # Maps a feed category term to the text substituted in the message.
        self.categories = {category.getAttribute("term"): category.getAttribute("part")
                           for category in item.getNodes('category')}

    @property
    def update(self):
        """Date of the next check; immediate while nothing was fetched yet."""
        if self.lastpage is None:
            return self.lastChange
        return self.lastChange + timedelta(seconds=self.updateTime)

    @property
    def url(self):
        """Server and path joined together (without scheme)."""
        return self.server + self.page

    def send_message(self, msg):
        """Send *msg* to the configured channels, or globally when none is set.

        The message goes through every server of the module-level ``SRVS``
        mapping.
        """
        global SRVS
        for server in SRVS.values():
            if len(self.channels) > 0:
                for chan in self.channels:
                    server.send_msg(chan, msg)
            else:
                server.send_global(msg)

    def _notify_entry(self, feed, entry):
        """Format and send the notification for one new feed *entry*."""
        placeholders = self.message.count("%s")
        if placeholders == 2 and len(self.categories) > 0:
            if entry.category is None or entry.category not in self.categories:
                # Fall back on the default ("") category; tolerate its
                # absence instead of raising KeyError.
                part = self.categories.get("", "")
            else:
                part = self.categories[entry.category]
            message = self.message % (part, "%s")
            self.send_message(message % entry.link)
        elif placeholders == 2:
            text = self.message % (unquote(entry.title), entry.link)
            # The youtube hook is optional: only use it when such a global
            # has actually been provided to this module, instead of failing
            # with NameError on every notification.
            hook = globals().get("youtube")
            if hook is not None and feed.id == hook.idAtom:
                hook.send_global(entry.link2, text)
            else:
                self.send_message(text)
        elif placeholders == 1:
            self.send_message(self.message % unquote(entry.title))
        else:
            self.send_message(self.message)

    def treat_atom(self, content):
        """Parse *content* as an Atom feed and announce its new entries.

        Returns a ``(feed, changed)`` tuple: the parsed feed and whether at
        least one new entry was found compared to the previous feed.
        """
        change = False
        f = Atom(content)
        if self.lastpage is not None:
            diff = self.lastpage.diff(f)
            if len(diff) > 0:
                print("[%s] Page differ!" % self.server)
                # Announce the entries in the reverse of the diff order.
                diff.reverse()
                for d in diff:
                    self._notify_entry(f, d)
                change = True
        return (f, change)

    def check(self):
        """Fetch the page once and send a notification if it changed.

        Any failure is reported on stdout and doubles the delay before the
        next check; the exception is not propagated.
        """
        try:
            (status, content) = getPage(self.server, self.page)
            if content is None:
                return

            if self.type == "atom":
                (self.lastpage, change) = self.treat_atom(content)
            else:
                digest = hashlib.sha224(content).hexdigest()
                if digest != self.lastpage:
                    # The very first fetch only records the reference state.
                    if self.lastpage is not None:
                        self.send_message(self.message)
                    self.lastpage = digest

            self.lastChange = datetime.now()
        except Exception:
            print("Une erreur est survenue lors de la récupération de la page " + self.url)
            traceback.print_exc()
            # NOTE: the delay is never reset or clamped afterwards.
            self.updateTime *= 2
def getPage(s, p):
    """Fetch ``http://<s><p>`` with a GET request.

    Returns a ``(status, body)`` tuple, where *body* is the raw bytes of the
    response, or ``(None, None)`` when the page cannot be retrieved.
    """
    conn = None
    try:
        # Built inside the try block: an invalid host (e.g. a non-numeric
        # port) is reported like any other retrieval failure.
        conn = http.client.HTTPConnection(s, timeout=10)
        conn.request("GET", p)
        res = conn.getresponse()
        data = res.read()
    except Exception:
        print ("[%s] impossible de récupérer la page %s."%(s, p))
        return (None, None)
    finally:
        # Release the socket on the failure path too.
        if conn is not None:
            conn.close()
    return (res.status, data)

View File

@ -1,46 +0,0 @@
# coding=utf-8
from datetime import datetime
import threading
class Watcher(threading.Thread):
    """Background thread that runs ``check()`` on registered sites when due.

    Each registered object must expose an ``update`` datetime (date of its
    next check) and a ``check()`` method.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.servers = list()
        # Termination flag read by run(). This instance attribute hides the
        # stop() method below, so the loop is ended either by setting the
        # flag and waking the thread (``w.stop = True; w.newSrv.set()``) or
        # by calling shutdown().
        self.stop = False
        # Set whenever the schedule must be recomputed.
        self.newSrv = threading.Event()

    def addServer(self, server):
        """Register *server* and wake the loop up to schedule it."""
        self.servers.append(server)
        self.newSrv.set()

    def check(self, closer):
        """Check *closer* now, then ask the loop to recompute the schedule."""
        closer.check()
        self.newSrv.set()

    @staticmethod
    def _seconds_until(when):
        """Return the non-negative number of seconds from now until *when*."""
        # total_seconds() keeps whole days and the sign; ``.seconds`` alone
        # turns a slightly negative delta into almost 24 hours.
        return max(0.0, (when - datetime.now()).total_seconds())

    def run(self):
        """Check overdue sites, then sleep until the next one is due."""
        while not self.stop:
            self.newSrv.clear()
            closer = None
            timer = None
            # Check what is overdue and find the next site to become due.
            for server in self.servers:
                if server.update < datetime.now():
                    self.check(server)
                elif closer is None or server.update < closer.update:
                    closer = server
            if closer is not None:
                timer = threading.Timer(self._seconds_until(closer.update),
                                        self.check, (closer,))
                timer.start()

            self.newSrv.wait()

            # Drop a still-pending timer; the schedule is rebuilt on the
            # next iteration. Cancelling a timer that already fired is a
            # no-op.
            if timer is not None:
                timer.cancel()

    def shutdown(self):
        """Ask the loop to terminate and wake it up."""
        self.stop = True
        self.newSrv.set()

    def stop(self):
        """Same as shutdown(); only reachable as ``Watcher.stop(watcher)``
        because instances carry a ``stop`` flag of the same name."""
        self.shutdown()

View File

@ -1,9 +1,18 @@
# coding=utf-8
nemubotversion = 3.0
from datetime import datetime
from datetime import timedelta
import http.client
import hashlib
import re
import socket
import sys
import traceback
from urllib.parse import unquote
from .Watcher import Watcher
from . import Site
from .atom import Atom
nemubotversion = 3.2
def help_tiny ():
"""Line inserted in the response to the command !help"""
@ -12,22 +21,141 @@ def help_tiny ():
def help_full ():
    """Return the detailed help text of the module."""
    # The module now answers the "watch" and "unwatch" commands, so the help
    # must no longer claim that it cannot be interacted with.
    return "This module watches websites and warns you when they change. Use !watch <url> to start watching a page and !unwatch <url> to stop."
# Bot context handed to load(); used afterwards to add and delete events.
CONTEXT = None
# Watcher thread handle, None while no watcher is running.
WATCHER = None


def load(context):
    """Keep a reference to the bot context and watch every saved site."""
    global CONTEXT
    CONTEXT = context
    saved_sites = DATAS.getNodes("watch")
    for node in saved_sites:
        start_watching(node)
def unload(context):
    """Remove from *context* the event attached to each watched site."""
    for node in DATAS.getNodes("watch"):
        event_id = node["evt_id"]
        context.del_event(event_id)
def start_watching(site):
    """Register the event that fetches *site* and reports its changes.

    The event calls getPage() with the site's server and page, compares the
    result with the stored ``lastcontent`` and hands over to alert_change().
    The identifier returned by the context is stored in ``site["evt_id"]``.
    """
    print_debug("Add event for site: http://%s%s" % (site["server"], site["page"]))
    event = ModuleEvent(
        func=getPage,
        cmp_data=site["lastcontent"],
        func_data={"s": site["server"], "p": site["page"]},
        intervalle=site.getInt("time"),
        call=alert_change,
        call_data=site,
    )
    event_id = CONTEXT.add_event(event)
    site["evt_id"] = event_id
def load():
    """Start the watcher thread with one Site per saved ``watch`` node."""
    global WATCHER, DATAS, SRVS
    # Hand the known servers over to the Site module before building sites.
    Site.SRVS = SRVS
    WATCHER = Watcher()
    for node in DATAS.getNodes("watch"):
        WATCHER.addServer(Site.Site(node))
    WATCHER.start()
# Optional "http://" prefix, then host (group 2) and absolute path (group 3).
# The negative lookahead refuses to read a URL scheme as the host name, which
# would otherwise turn "http://example.com" into host "http:" and path
# "//example.com", and would accept unsupported schemes such as https.
_URL_PATTERN = re.compile(
    r"^(http://)?(?![A-Za-z][A-Za-z0-9+.-]*://)([^/]+)(/.*)$")


def explore_url(url):
    """Split *url* into its host and path.

    Returns a match object whose group 2 is the host (with its port, if any)
    and group 3 the absolute path, or None when *url* cannot be watched.
    A URL without any path is understood as pointing to "/".
    """
    rx = _URL_PATTERN.match(url)
    if rx is None:
        # No path given ("example.com", "http://example.com"): watch the root.
        rx = _URL_PATTERN.match(url + "/")
    return rx
def close():
    """Ask the running watcher thread, if any, to terminate."""
    global WATCHER
    if WATCHER is None:
        return
    # Raise the stop flag, then wake the thread so that it notices it.
    WATCHER.stop = True
    WATCHER.newSrv.set()
def found_site(s, p):
    """Return the first saved site whose server is *s* and page is *p*.

    Returns None when no saved site matches.
    """
    matching = (node for node in DATAS
                if node["server"] == s and node["page"] == p)
    return next(matching, None)
def del_site(msg):
    """Handle the "unwatch" command: stop watching the URL given in *msg*.

    Only the user who registered the watch, or the bot owner, may remove it.
    Returns the Response to send back to the requester.
    """
    if len(msg.cmd) <= 1:
        return Response(msg.sender, "quel site dois-je arrêter de surveiller ?",
                        msg.channel, msg.nick)

    rx = explore_url(msg.cmd[1])
    if rx is not None:
        site = found_site(rx.group(2), rx.group(3))
        if site is not None and (msg.sender == site["sender"] or msg.is_owner):
            CONTEXT.del_event(site["evt_id"])
            DATAS.delChild(site)
            # Persist the removal, as add_site() does for additions;
            # otherwise the saved data still contains the site.
            save()
            return Response(msg.sender, "je ne surveille désormais plus cette URL.",
                            channel=msg.channel, nick=msg.nick)
    return Response(msg.sender, "je ne surveillais pas cette URL pour vous.",
                    channel=msg.channel, nick=msg.nick)
def add_site(msg):
    """Handle the "watch" command: start watching the URL given in *msg*.

    The watch is stored in the module data, saved, and its check event is
    registered immediately. Returns the Response to send to the requester.
    """
    if len(msg.cmd) <= 1:
        return Response(msg.sender, "quel site dois-je surveiller ?",
                        msg.channel, msg.nick)

    rx = explore_url(msg.cmd[1])
    if rx is None:
        return Response(msg.sender, "je ne peux pas surveiller cette URL",
                        channel=msg.channel, nick=msg.nick)

    watch = ModuleState("watch")
    watch["sender"] = msg.sender
    watch["irc"] = msg.srv.id
    watch["channel"] = msg.channel
    watch["type"] = "diff"
    watch["server"] = rx.group(2)
    watch["page"] = rx.group(3)
    # Fixed check interval for user-requested sites.
    watch["time"] = 123
    watch["message"] = "http://%s%s a changé !" % (watch["server"],
                                                   watch["page"])
    DATAS.addChild(watch)
    start_watching(watch)
    save()
    # A comma was missing between the nick and message arguments, which made
    # the whole module fail to compile.
    return Response(msg.sender, channel=msg.channel, nick=msg.nick,
                    message="ce site est maintenant sous ma surveillance.")
def alert_change(content, site):
    """Alert when a change is detected

    Registered by start_watching() as the ``call`` of the site's event, with
    the site node as ``call_data``.

    content -- new page content as returned by getPage(); None when the page
               could not be fetched
    site    -- the watched site node (server, page, type, message, irc,
               sender, channel, lastcontent...)
    """
    # Register a fresh event for this site before anything else
    # (presumably events fire only once -- TODO confirm against ModuleEvent).
    start_watching(site)
    # Failed fetch: nothing to compare, keep the stored content untouched.
    if content is None:
        return

    if site["type"] == "atom":
        # Build the reference feed from the stored content on first use.
        # NOTE(review): fails if "lastcontent" is still empty -- confirm that
        # it is always set before an atom site gets here.
        if site["_lastpage"] is None:
            site["_lastpage"] = Atom(site["lastcontent"])
        page = Atom(content)
        diff = site["_lastpage"].diff(page)
        if len(diff) > 0:
            site["_lastpage"] = page
            print_debug("[%s] Page differ!" % site["server"])
            # Announce the entries in the reverse of the diff order.
            diff.reverse()
            for d in diff:
                # NOTE(review): this lookup does not depend on d and could be
                # done once before the loop; the node name ("categories") and
                # the setIndex() call should be checked against the
                # ModuleState API.
                categories = site.getNodes("categories")
                categories.setIndex("term")
                # Two placeholders and configured categories:
                # the message gets (category text, entry link).
                if site["message"].count("%s") == 2 and len(categories) > 0:
                    if d.category is None or d.category not in categories:
                        # Unknown or missing category: use the default ("") one.
                        messageI = site["message"] % (categories[""], "%s")
                    else:
                        messageI = site["message"] % (categories[d.category], "%s")
                    send_response(site["irc"], Response(site["sender"],
                                                        messageI % d.link,
                                                        site["channel"]))
                # Two placeholders without categories: (entry title, entry link).
                elif site["message"].count("%s") == 2:
                    send_response(site["irc"], Response(site["sender"],
                                                        site["message"] % (unquote(d.title), d.link),
                                                        site["channel"]))
                # One placeholder: entry title only.
                elif site["message"].count("%s") == 1:
                    send_response(site["irc"], Response(site["sender"],
                                                        site["message"] % unquote (d.title),
                                                        site["channel"]))
                # No placeholder: the message is sent as is.
                else:
                    send_response(site["irc"], Response(site["sender"],
                                                        site["message"],
                                                        site["channel"]))
        else:
            return #Stop here, no changes, so don't save
    else: # Just looking for any changes
        # NOTE(review): there is no check here that a previous content
        # exists, so the first fetch of a new site may be reported as a
        # change -- confirm.
        send_response(site["irc"], Response(site["sender"], site["message"], site["channel"]))
    # Remember the new content as the reference and persist it.
    site["lastcontent"] = content
    save()
#TODO: built-in this function
def getPage(s, p):
    """Return the page content

    Fetches ``http://<s><p>`` and returns its body as text, or None when the
    page cannot be retrieved.
    """
    print_debug("Looking http://%s%s"%(s,p))
    conn = None
    try:
        # Built inside the try block so that an invalid host is reported
        # like any other retrieval failure.
        conn = http.client.HTTPConnection(s, timeout=10)
        conn.request("GET", p)
        res = conn.getresponse()
        data = res.read()
    except Exception:
        print ("[%s] impossible de récupérer la page %s."%(s, p))
        return None
    finally:
        # Release the socket on the failure path too.
        if conn is not None:
            conn.close()
    # Undecodable bytes are replaced rather than raising: a page that is not
    # valid UTF-8 can still be compared from one fetch to the next.
    return data.decode(errors="replace")

View File

@ -7,59 +7,63 @@ from xml.dom.minidom import parseString
from xml.dom.minidom import getDOMImplementation
class AtomEntry:
    """One ``<entry>`` of an Atom feed.

    Attributes: ``id``, ``title`` ("" when empty), ``updated`` (a
    ``time.struct_time``), ``summary``, ``link`` (href of the first
    ``<link>``), ``link2`` (href of the second one) and ``category`` (term of
    the first ``<category>``); the optional ones are None when absent.
    """

    def __init__ (self, node):
        """Extract the entry fields from the DOM element *node*."""
        self.id = node.getElementsByTagName("id")[0].firstChild.nodeValue
        self.title = self._text(node, "title") or ""
        self.updated = self._parse_updated(self._text(node, "updated"))
        self.summary = self._text(node, "summary")

        links = node.getElementsByTagName("link")
        self.link = links[0].getAttribute("href") if len(links) > 0 else None
        self.link2 = links[1].getAttribute("href") if len(links) > 1 else None

        categories = node.getElementsByTagName("category")
        if len(categories) >= 1:
            self.category = categories[0].getAttribute("term")
        else:
            self.category = None

    @staticmethod
    def _text(node, tag):
        """Return the text of the first *tag* element under *node*, or None."""
        found = node.getElementsByTagName(tag)
        if len(found) > 0 and found[0].firstChild is not None:
            return found[0].firstChild.nodeValue
        return None

    @staticmethod
    def _parse_updated(raw):
        """Parse the ``<updated>`` text *raw*; fall back on the current time.

        Accepts a full timestamp (anything after the seconds, such as the
        timezone, is ignored) or a bare date.
        """
        if raw is not None:
            for length, fmt in ((19, "%Y-%m-%dT%H:%M:%S"), (10, "%Y-%m-%d")):
                try:
                    return time.strptime(raw[:length], fmt)
                except ValueError:
                    continue
            # Show the value that could not be understood.
            print (raw[:10])
        # Missing or unreadable date: consider the entry as just updated.
        return time.localtime ()
class Atom:
    """A parsed Atom feed.

    Attributes: ``raw`` (the document as given), ``feed`` (its root DOM
    element), ``id``, ``title``, ``entries`` (AtomEntry objects indexed by
    entry id) and ``updated`` (most recent entry date, None without entries).
    """

    def __init__ (self, string):
        """Parse the Atom document *string*."""
        self.raw = string
        self.feed = parseString (string).documentElement
        self.id = self.feed.getElementsByTagName("id")[0].firstChild.nodeValue
        # An absent or empty title gives "" instead of raising.
        titles = self.feed.getElementsByTagName("title")
        if len(titles) > 0 and titles[0].firstChild is not None:
            self.title = titles[0].firstChild.nodeValue
        else:
            self.title = ""

        self.updated = None
        self.entries = dict ()
        for item in self.feed.getElementsByTagName("entry"):
            entry = AtomEntry (item)
            self.entries[entry.id] = entry
            if self.updated is None or self.updated < entry.updated:
                self.updated = entry.updated

    def __str__(self):
        """Return the document this feed was built from, as text."""
        # __str__ must return str even when the feed was built from bytes.
        if isinstance(self.raw, bytes):
            return self.raw.decode(errors="replace")
        return self.raw

    def diff (self, other):
        """Return the entries of *other* that are new compared to this feed.

        An entry is new when its id is unknown here and it is not older than
        the most recent entry of this feed. When this feed has no date yet,
        the date of the first unknown entry met is adopted as reference.
        """
        differ = list ()
        for k, entry in other.entries.items ():
            if k in self.entries:
                continue
            if self.updated is None:
                self.updated = entry.updated
            if entry.updated >= self.updated:
                differ.append (entry)
        return differ
if __name__ == "__main__":