#! /usr/local/bin/python
# ex:ts=4
import urllib, urlparse
import re, os.path, os
# Matches a tag whose src attribute value is NOT quoted.  Group 1 is the tag
# name, group 2 the attribute value.  The value stops at whitespace or ">" so
# it cannot swallow the end of its own tag and run on into the following one
# (e.g. "<img src=a.png><img src=b.png>" must yield two separate matches).
urlfind1 = re.compile(r'[<](\w*)[^>]*[Ss][Rr][Cc]=([^"\' >][^\s>]*)[^>]*[>]')
# Matches a tag whose src attribute value is wrapped in single or double
# quotes.  Group 1 is the tag name, group 2 the value without its quotes.
urlfind2 = re.compile(r'[<](\w*)[^>]*[Ss][Rr][Cc]=["\']([^"\']*)["\'][^>]*[>]')
def _local_path(url):
    """Map *url* to a relative file path below the current directory."""
    path = urlparse.urlparse(url)[2]
    # Drop empty, "." and ".." segments so that no URL, however odd or
    # hostile, can resolve to a location outside the current directory.
    parts = [seg for seg in path.split("/") if seg not in ("", ".", "..")]
    # A URL naming a directory (or the site root) has no file name of its
    # own, so give the saved copy a conventional one.
    if not parts or path.endswith("/"):
        parts.append("index.html")
    return os.path.join(*parts)


def recurget(url, got=None, depth=0):
    """Download *url* and, recursively, everything its ``src=`` attributes name.

    The response body is written to a file below the current directory whose
    relative path mirrors the path component of the URL.  The body is then
    scanned with the module-level ``urlfind1`` / ``urlfind2`` patterns and
    every referenced URL that has not been seen yet is fetched the same way.

    Parameters:
        url   -- absolute URL to fetch.
        got   -- list of URLs already scheduled; it is extended in place and
                 shared by all recursive calls.  A fresh list is used when
                 omitted.
        depth -- recursion level, used only to indent the progress output.

    Returns None.  Errors raised while fetching or writing are not caught
    here and propagate to the caller.
    """
    if got is None:
        got = []
    # Record the page itself so a document that references its own URL is
    # not downloaded a second time.
    if url not in got:
        got.append(url)
    print(" " * depth + "Getting %s ..." % url)
    response = urllib.urlopen(url)
    try:
        content = response.read()
    finally:
        response.close()
    urls = urlfind1.findall(content) + urlfind2.findall(content)
    filepath = _local_path(url)
    dirname = os.path.dirname(filepath)
    if dirname:
        try:
            os.makedirs(dirname)
        except OSError:
            # Normally this just means the directory already exists; a
            # genuine failure is reported by open() right below.
            pass
    # Binary mode: the payload may be an image or other non-text resource
    # and must be stored without newline translation.
    with open(filepath, "wb") as out:
        out.write(content)
    for tag, target in urls:
        turl = urlparse.urljoin(url, target)
        print(" " * depth + " + Tag found (%s to %s) => %s" % (tag, target, turl))
        if turl not in got:
            got.append(turl)
            recurget(turl, got, depth + 1)
if __name__ == "__main__":
    import sys

    # Every command-line argument is a start URL; mirror each one in the
    # order it was given.
    start_urls = sys.argv[1:]
    for start_url in start_urls:
        recurget(start_url)
|