#!/usr/bin/env python
############################################################################
# Copyright (C) 2005 by Nikhil Marathe #
# nsm.nikhil@gmail.com #
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; either version 2 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, write to the #
# Free Software Foundation, Inc., #
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #
############################################################################
import getopt
import re
import sys

try:
    from urllib import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3
  25. class WebTangler(object):
  26. urlList = []
  27. def appendToFile(self, st):
  28. self.file.write(st+'\n')
  29. def __init__(self, depth, times, portal, output):
  30. #match all urls which have the keyword
  31. self.reg_exp = re.compile(r'<a href=["\']?(http://?[a-zA-Z0-9\-\./]+)["\']?>')
  32. self.file = open(output, 'wa')
  33. self.urlList.append(portal)
  34. while times:
  35. self.crawl(depth)
  36. times-=1
  37. self.file.close()
  38. def crawl(self, depth):
  39. if depth < 1:return
  40. currLink = self.urlList.pop()
  41. url_contents = []
  42. #read in the file
  43. for i in urlopen(currLink).readlines():url_contents.append(i)
  44. #create a string
  45. url_contents = ''.join(url_contents)
  46. #replace all whitespace with spaces
  47. url_contents.replace(' \n\t', ' ')
  48. #print url_contents
  49. for link in re.findall(self.reg_exp, url_contents):
  50. self.urlList.append(link)
  51. self.appendToFile(link)
  52. self.crawl(depth-1)
  53. try:
  54. switches, x = getopt.getopt(sys.argv[1:], None, ['depth=', 'times=', 'portal=', 'output='])
  55. #not all args passed
  56. if len(switches) < 4:
  57. raise getopt.GetoptError, "Not enough args"
  58. WebTangler(int(switches[0][1]), int(switches[1][1]), switches[2][1], switches[3][1])
  59. except getopt.GetoptError:
  60. print '''
  61. webtangler.py -- a simple python webcrawler.
  62. Arguments:
  63. --depth=n - depth to descend into one link.
  64. --times=n - after how many times should WebTangler stop.
  65. --portal=site - any link in complete format(eg. http://www.python.org/)
  66. --output=file - file to output links to
  67. Have fun using it
  68. '''