-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- import sys
- import getopt
- import re
- from urllib import urlopen
-
class WebTangler(object):
    """Minimal recursive web crawler that records every link it finds.

    Constructing an instance performs the whole crawl: the output file is
    opened, ``crawl(depth)`` is invoked ``times`` times, and the file is
    closed again before ``__init__`` returns.

    Attributes:
        urlList: stack of URLs still waiting to be fetched (LIFO).
        reg_exp: compiled pattern matching ``<a href=...>`` tags whose
            target starts with ``http:/``; group 1 is the URL.
        file: the output file object links are written to.
    """

    # Class-level default kept so ``WebTangler.urlList`` still exists;
    # every instance replaces it with its own list in __init__ so that
    # pending URLs are never shared between crawlers.
    urlList = []

    def appendToFile(self, st):
        """Write ``st`` followed by a newline to the output file."""
        self.file.write(st + '\n')

    def __init__(self, depth, times, portal, output):
        """Run a complete crawl and store the discovered links.

        Args:
            depth: how many levels of links to follow from each start page.
            times: how many times to start a crawl from the pending stack;
                zero or a negative value performs no crawl at all.
            portal: URL the first crawl starts from.
            output: path of the file that receives one link per line; it
                is truncated if it already exists.

        Raises:
            IOError: if the output file cannot be opened or a page cannot
                be fetched. The output file is closed before the error
                propagates.
        """
        self.reg_exp = re.compile(r'<a href=["\']?(http://?[a-zA-Z0-9\-\./]+)["\']?>')
        self.urlList = [portal]
        # Plain write mode: the invalid mode string 'wa' is rejected
        # outright by Python 3's open().
        self.file = open(output, 'w')
        try:
            # range() yields nothing for times <= 0, so a negative count
            # cannot spin forever.
            for _ in range(times):
                self.crawl(depth)
        finally:
            self.file.close()

    def crawl(self, depth):
        """Fetch the most recently queued URL and follow its links.

        Every link found on the page is pushed onto ``urlList``, written
        to the output file, and then crawled with ``depth - 1``. Returns
        ``None`` immediately when ``depth`` is below 1 or when there is
        no URL left to fetch.

        Args:
            depth: remaining number of levels to descend.
        """
        if depth < 1 or not self.urlList:
            return
        currLink = self.urlList.pop()

        response = urlopen(currLink)
        try:
            page = response.read()
        finally:
            response.close()

        # Python 3's urlopen yields bytes; decode so the text pattern can
        # be applied. On Python 2 the payload is already ``str`` and is
        # left untouched. The pattern only captures ASCII characters, so
        # replacing undecodable bytes cannot corrupt a matched link.
        if not isinstance(page, str):
            page = page.decode('utf-8', 'replace')

        for link in self.reg_exp.findall(page):
            self.urlList.append(link)
            self.appendToFile(link)
            self.crawl(depth - 1)
-
_USAGE = '''
webtangler.py -- a simple python webcrawler.

Arguments:
--depth=n - depth to descend into one link.
--times=n - after how many times should WebTangler stop.
--portal=site - any link in complete format(eg. http://www.python.org/)
--output=file - file to output links to

Have fun using it
'''

_REQUIRED_SWITCHES = ('--depth', '--times', '--portal', '--output')


def _parse_args(argv):
    """Turn command-line arguments into ``(depth, times, portal, output)``.

    The four long options may be given in any order; if one is repeated
    the last occurrence wins.

    Raises:
        getopt.GetoptError: on an unknown option, a missing required
            option, or a non-integer ``--depth`` / ``--times`` value.
    """
    # The short-option spec must be a string: getopt takes its length as
    # soon as it meets a short option such as "-x", and None would fail
    # with TypeError there instead of GetoptError.
    switches, _ = getopt.getopt(argv, '', [name[2:] + '=' for name in _REQUIRED_SWITCHES])
    # Look options up by name rather than by position so their order on
    # the command line does not matter.
    options = dict(switches)
    if any(name not in options for name in _REQUIRED_SWITCHES):
        raise getopt.GetoptError("Not enough args")
    try:
        depth = int(options['--depth'])
        times = int(options['--times'])
    except ValueError:
        raise getopt.GetoptError("--depth and --times must be integers")
    return depth, times, options['--portal'], options['--output']


def main(argv=None):
    """Command-line entry point: crawl as requested or print the usage text.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        depth, times, portal, output = _parse_args(argv)
    except getopt.GetoptError:
        print(_USAGE)
        return
    # Constructing the crawler performs the whole crawl.
    WebTangler(depth, times, portal, output)


if __name__ == '__main__':
    main()