1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | Tools/webchecker/websucker.py
#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision$"

import os
import sys
import urllib
import getopt

import webchecker

# Extract real version number if necessary
# (under RCS/CVS the keyword expands to "$Revision: N.M $" -- three
# space-separated words -- and we keep only the middle one, "N.M";
# an unexpanded "$Revision$" is left as-is).
if __version__[0] == '$':
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]


def main():
    """Command-line entry point.

    Usage: websucker.py [-qv] ... [rooturl] ...

    -q  silences output (verbose = 0); -v bumps the verbosity by one
    (flags are processed left to right, so e.g. "-q -v" yields 1).
    Returns 2 on a getopt usage error; otherwise falls off the end
    (None), which the __main__ guard turns into exit status 0.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify ourselves to servers via the User-agent header.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    # Each positional argument is a root URL to mirror.
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()


class Sucker(webchecker.Checker):
    """A webchecker.Checker that saves every page it reads to disk.

    Pages are written under a local directory tree keyed by host name
    (see savefilename()), producing a mirror of the remote site.
    """

    # Don't follow/check external links, and don't track name anchors.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for url_pair, saving fetched pages locally.

        If a previously mirrored copy exists at the local path, it is
        read from disk instead of refetching.  `text` is the page body
        when the content is HTML, else None; `url` may differ from the
        input when the fetch was redirected (f.geturl()).
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            # Cache check: a readable file at `path` means we already
            # mirrored this URL.
            f = open(path, "rb")
        except IOError:
            # Not mirrored yet -- fetch it.  openpage() is inherited
            # from webchecker.Checker; the `if f:` guard suggests it
            # returns a false value on failure (TODO confirm in
            # webchecker).
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                # Follow redirects: recompute the save path for the
                # final URL so the mirror reflects where we ended up.
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                # Save everything we fetch (images, etc.), but only
                # return text for HTML so only HTML gets parsed.
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Cache hit: no headers available, so pass an empty info
            # dict and let checkforhtml() decide from the URL alone.
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url

    def savefile(self, text, path):
        """Write `text` to `path`, creating parent directories as needed.

        Failures are reported via self.message() rather than raised, so
        one unwritable file doesn't abort the whole mirroring run.
        """
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))

    def savefilename(self, url):
        """Map a URL to the relative local path where it is mirrored.

        The path is <lowercased host>/<url path>, with any user-info
        and port stripped from the host, a trailing-slash (or empty)
        path replaced by index.html, and "/" translated to the local
        os.sep on non-POSIX platforms.
        """
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        path = path.lstrip("/")
        # Discard "user@" and ":port" pieces -- only the bare host
        # names the top-level mirror directory.
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = host.lower()
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = os.sep.join(path.split("/"))
        path = os.path.join(host, path)
        return path


def makedirs(dir):
    """Recursively create `dir` and any missing ancestors.

    Special case: if `dir` exists but is a plain file (e.g. we saved
    http://host/a as file "a" and now need directory "a" for
    http://host/a/b), the file is moved aside and becomes
    dir/index.html inside the newly created directory.  Errors in that
    shuffle are deliberately ignored (best effort).
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        # Reached a root with no directory component left to create.
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)


if __name__ == '__main__':
    # main() returns None on success (exit 0) or 2 on usage error.
    sys.exit(main() or 0)