#!/int/python1.5
#
# notify.py -- notify about web page changes
#
# Keith Waclena
#

__author__ = "Keith Waclena, k-waclena@uchicago.edu"
__version__ = "0.2"
__patchlevel__ = "0"
__production__ = 0

# Number of bytes requested per read while checksumming a page.
CHUNKSIZE = 16 * 1024

import sys, shelve, urllib, md5
from urlparse import urlparse, urlunparse
from string import join
import main


class summary:
    """Checksum record kept in the database for one URL.

    digest is the raw MD5 digest of the page data; size is the number of
    bytes that were read to compute it.
    """

    def __init__(self, digest, size):
        self.digest, self.size = digest, size


def summarize(fp):
    """Return a summary (MD5 digest and byte count) of all the data left
    on open file fp.

    The data is read CHUNKSIZE bytes at a time.  fp is not closed here;
    that is the caller's job.
    """
    m = md5.new()
    size = 0
    while 1:
        chunk = fp.read(CHUNKSIZE)
        if not chunk:
            break
        m.update(chunk)
        size = size + len(chunk)
    return summary(m.digest(), size)


def normalize(url):
    """Normalize url by stripping any params, query and fragment parts;
    also, if original url was of the form `www.foo.com', convert this to
    `http://www.foo.com'.
    """
    scheme, netloc, path = urlparse(url, "http")[:3]
    if not netloc and path:
        # A bare `www.foo.com' parses as a path with no host part, so
        # the path is promoted to the host position.
        return urlunparse((scheme, path, "", "", "", ""))
    return urlunparse((scheme, netloc, path, "", "", ""))


def _ioerror_text(err):
    """Return a printable message for the IOError instance err.

    The two-argument form yields its second argument; any other number
    of arguments falls back to str(err) instead of failing to unpack.
    """
    if len(err.args) == 2:
        return str(err.args[1])
    return str(err)


class Main(main.main):
    """Command-line driver: maintains a shelve database mapping normalized
    urls to summary records and reports which urls have changed.
    """

    Usage = "-[vqadscu] database [url ...]"

    try:
        options = [main.opt("v", doc = "set VERBOSITY level to 1"),
                   main.opt("q", doc = "quiet mode; no output, only exit"
                            " status"),
                   main.opt("a", doc = "add urls to database, don't check"),
                   main.opt("d", doc = "delete urls from database, don't"
                            " check"),
                   main.opt("s", doc = "show urls in database"),
                   main.opt("c", doc = "list urls that have changed"),
                   main.opt("u", doc = "autoupdate of checksums of urls"
                            " that have changed"),]
    except main.opt:
        o = sys.exc_info()[1]
        main.dryrot("%s: %s" % (o, o.error))

    def main(self):
        """Parse the options, open the database and run the requested
        action (add, delete, show, or -- by default -- check).

        When checking, exits with status 0 if every checked url is
        unchanged or if -c was given, and with status 1 otherwise.
        Exits with status 2 if no database argument was supplied.
        """
        ((self.verbose.verbosity, self.quiet, adding, deleting, showing,
          self.changed, self.updating),
         self.args) = main.opts(self.options).getopt("v", "q", "a", "d",
                                                     "s", "c", "u")
        self.db = None
        if self.args:
            self.database = self.args[0]
            self.db = shelve.open(self.database)
            self.args = self.args[1:]
        if self.db is None:
            # Every action needs the database; without this guard the
            # show and check paths fail on self.db.keys().
            self.warning("no database specified")
            sys.exit(2)
        try:
            if adding:
                self.add()
            elif deleting:
                self.delete()
            elif showing:
                self.show()
            else:
                # checking...
                if self.args:
                    # check named urls only
                    allswell = self.check(self.args)
                else:
                    # check all urls in database
                    allswell = self.check(self.db.keys())
                if allswell or self.changed:
                    sys.exit(0)
                else:
                    sys.exit(1)
        finally:
            # Compare against None: an empty shelf is false, and it
            # still has to be closed.
            if self.db is not None:
                self.db.close()

    def _open(self, url, prefix):
        """Open url, reporting progress at verbosity level 1.

        prefix is the text printed in front of the url in the progress
        message.  Returns the open file object, or None if the open
        failed with IOError (the error text is reported instead of
        "done").
        """
        self.verbose.partial(1, "%s%s, " % (prefix, url))
        try:
            fp = urllib.urlopen(url)
        except IOError:
            self.verbose.finish(1, "%s" % _ioerror_text(sys.exc_info()[1]))
            return None
        self.verbose.finish(1, "done")
        return fp

    def _compare(self, url, fp):
        """Compare the data on open file fp with the digest stored for
        url, printing the verdict and updating the database if asked to.

        Returns 1 if the digests match, 0 if they differ, and None if
        url is not in the database.  fp is always closed.
        """
        try:
            try:
                previous = self.db[url].digest
            except KeyError:
                self.warning("%s not in database" % url)
                return None
            s = summarize(fp)
        finally:
            fp.close()
        if s.digest == previous:
            if not (self.quiet or self.changed):
                print("same\t%s" % url)
            return 1
        if self.quiet:
            pass
        elif self.changed:
            print(url)
        else:
            print("DIFFER\t%s" % url)
        if self.updating:
            self.verbose(1, "updating %s" % url)
            self.db[url] = s
        return 0

    def check(self, urls):
        """Check all URLs in urls (a sequence of strings).

        Returns 1 if every url that could be compared was unchanged and
        0 if any differed.  Urls that cannot be opened or are not in
        the database are reported and do not affect the result.
        """
        allsame = 1
        for url in urls:
            url = normalize(url)
            fp = self._open(url, "")
            if fp is None:
                continue
            same = self._compare(url, fp)
            if same is None:
                continue
            if not same:
                allsame = 0
        return allsame

    def add(self):
        """Add urls in self.args to database.

        Urls that cannot be opened are reported and skipped.
        """
        for url in self.args:
            url = normalize(url)
            fp = self._open(url, "adding ")
            if fp is None:
                continue
            try:
                self.db[url] = summarize(fp)
            finally:
                fp.close()

    def delete(self):
        """Delete urls in self.args from database, warning about any
        that are not there.
        """
        for url in self.args:
            url = normalize(url)
            try:
                del self.db[url]
                self.verbose(1, "deleted %s" % url)
            except KeyError:
                self.warning("%s not in database" % url)

    def show(self):
        """Show all urls in database in a parsable form on stdout: one
        line per url holding the hex digest, the size and the url,
        separated by tabs.
        """
        def md5repr(digest):
            # Two hex digits per byte, so the result always has the
            # same width and decodes unambiguously.
            return join(map(lambda c: "%02x" % ord(c), digest), "")
        for key in self.db.keys():
            s = self.db[key]
            print("%s\t%s\t%s" % (md5repr(s.digest), s.size, key))


Main(production = __production__)()

# Local Variables:
# py-indent-offset: 4
# fill-column: 76
# End: