/**
 * A small web crawler, ported from the Python crawler Webtangler.
 * Nikhil Marathe nsm.nikhil@gmail.com
 */
  5. using System;
  6. using System.Text.RegularExpressions;
  7. using System.Collections;
  8. using System.IO;
  9. using System.Net;
  /*
   * Spider: a minimal crawler. Starting from a portal URL it downloads a page,
   * extracts every substring that matches the URL pattern, appends each one to
   * the output file and queues it so that it can be fetched later.
   */
  public class Spider {
      /* FIFO of URL strings that are waiting to be fetched. */
      Queue urlList = new Queue();
      /* Writer for the output file; only valid while the constructor is running. */
      StreamWriter urlFileStr;
      /* Pattern used to pick URLs out of a line of page text. */
      Regex reg;

      /*
       * Runs the whole crawl and writes every URL found to the output file.
       *
       * depth  - number of queued links followed, one after another, per pass
       * times  - number of passes
       * portal - URL of the first page to fetch
       * output - path of the file that receives the URLs, one per line
       *
       * Any exception is reported on the console instead of being propagated,
       * so constructing a Spider never throws.
       */
      public Spider(int depth, int times, string portal, string output)
      {
          // NOTE(review): "http://?" makes the second slash optional; this looks
          // like a typo for "https?://". Pattern kept as-is to preserve matching.
          reg = new Regex("http://?[a-zA-Z0-9-\\.\\/]+");
          try
          {
              // append == false creates the file or truncates an existing one, so
              // a shorter run never leaves stale bytes from a previous run behind.
              urlFileStr = new StreamWriter(output, false);
              try
              {
                  urlList.Enqueue(portal);
                  while (times > 0)
                  {
                      crawl(depth);
                      times--;
                  }
              }
              finally
              {
                  // Closing the writer (not just the underlying stream) flushes
                  // its buffer; this must also happen when the crawl fails.
                  urlFileStr.Close();
              }
          }
          catch (Exception e)
          {
              Console.WriteLine("Error: " + e);
          }
      }

      /*
       * A recursive function to crawl with decreasing depth.
       *
       * Takes the next URL off the queue, downloads it, records every URL found
       * in it (queue + output file) and then recurses with depth - 1.
       * Stops when depth drops below 1 or no URL is left in the queue.
       */
      private void crawl(int depth)
      {
          if (depth < 1) return;
          // Dequeue throws on an empty queue; running out of links just ends the crawl.
          if (urlList.Count == 0) return;

          string curLink = (string)urlList.Dequeue();
          Console.WriteLine(curLink);

          try
          {
              using (WebClient client = new WebClient())
              using (StreamReader urlData = new StreamReader(client.OpenRead(curLink)))
              {
                  string line = null;
                  while ((line = urlData.ReadLine()) != null)
                  {
                      Console.WriteLine("Line " + line);
                      // Matches yields only successful matches, and all of them,
                      // so lines without a URL contribute nothing.
                      foreach (Match match in reg.Matches(line))
                      {
                          // Queue the matched text: the queue is read back as string.
                          string url = match.Value;
                          Console.WriteLine("Match " + url);
                          urlList.Enqueue(url);
                          urlFileStr.Write(url + "\n");
                      }
                  }
              }
          }
          catch (WebException e)
          {
              // Scraped links are frequently dead; skip this one and keep crawling.
              Console.WriteLine("Error: " + e.Message);
          }

          crawl(depth - 1);
      }

      /* Entry point: crawls from the KDE home page and writes the result to urls.txt. */
      public static void Main()
      {
          new Spider(2, 2, "http://www.kde.org/", "urls.txt");
      }
  }