-
-
-
-
- using System;
- using System.Text.RegularExpressions;
- using System.Collections;
- using System.IO;
- using System.Net;
-
/// <summary>
/// Minimal web spider. Starting from a portal URL it downloads pages,
/// extracts every substring that matches the link pattern, appends each
/// one to an output file and queues it for a later fetch.
/// All of the work is performed by the constructor.
/// </summary>
public class Spider {
    // FIFO queue of URL strings that are waiting to be fetched.
    Queue urlList = new Queue();
    // Writer for the output file; only valid while the constructor is running.
    StreamWriter urlFileStr;
    // Pattern used to recognise links inside a downloaded line.
    Regex reg;

    /// <summary>
    /// Runs the crawl and writes every discovered link to <paramref name="output"/>,
    /// one per line (terminated by "\n").
    /// </summary>
    /// <param name="depth">
    /// Maximum number of queued pages fetched per pass. Despite the name this
    /// is a page count, not a link-distance from the portal.
    /// </param>
    /// <param name="times">Number of passes; at most depth * times pages are fetched.</param>
    /// <param name="portal">URL of the first page to fetch.</param>
    /// <param name="output">Path of the file that receives the links; it is overwritten.</param>
    /// <remarks>
    /// Any exception is reported on the console as "Error: ..." and is not
    /// rethrown, so construction never fails from the caller's point of view.
    /// </remarks>
    public Spider(int depth, int times, string portal, string output)
    {
        // NOTE(review): the pattern makes the second '/' optional, does not
        // cover https and stops at '?', '=', '&' etc. Left unchanged on purpose.
        reg = new Regex("http://?[a-zA-Z0-9-\\.\\/]+");
        try
        {
            // FileMode.Create truncates an existing file, so a shorter run does
            // not leave the tail of a previous, longer result behind.
            urlFileStr = new StreamWriter(new FileStream(output, FileMode.Create, FileAccess.Write));
            try
            {
                urlList.Enqueue(portal);
                while (times > 0)
                {
                    crawl(depth);
                    times--;
                }
            }
            finally
            {
                // Closing the writer (not just the underlying stream) flushes
                // the buffered links to disk, even when the crawl failed.
                urlFileStr.Close();
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("Error: " + e);
        }
    }

    /// <summary>
    /// Fetches up to <paramref name="depth"/> pages from the front of the queue.
    /// Every link found on a page is printed, written to the output file and
    /// appended to the queue. Stops early when the queue runs empty.
    /// </summary>
    /// <param name="depth">Maximum number of pages to fetch in this pass.</param>
    private void crawl(int depth)
    {
        // Loop form of the original tail recursion: one page per step.
        for (; depth >= 1 && urlList.Count > 0; depth--)
        {
            string curLink = (string)urlList.Dequeue();
            Console.WriteLine(curLink);

            try
            {
                using (WebClient client = new WebClient())
                using (Stream page = client.OpenRead(curLink))
                using (StreamReader urlData = new StreamReader(page))
                {
                    string line;
                    while ((line = urlData.ReadLine()) != null)
                    {
                        // Matches() yields every link on the line and nothing
                        // at all when the line contains none.
                        foreach (Match match in reg.Matches(line))
                        {
                            // Queue the matched text, not the Match object,
                            // so that the cast on Dequeue() succeeds.
                            string url = match.Value;
                            Console.WriteLine("Match " + url);
                            urlList.Enqueue(url);
                            urlFileStr.Write(url + "\n");
                        }
                    }
                }
            }
            catch (WebException e)
            {
                // A page that cannot be downloaded must not end the whole crawl.
                Console.WriteLine("Error: " + e);
            }
            catch (IOException e)
            {
                // Same for a connection that breaks while the page is being read.
                Console.WriteLine("Error: " + e);
            }
        }
    }

    /// <summary>
    /// Command-line entry point: crawls from http://www.kde.org/ with
    /// depth 2 and 2 passes, writing the links to urls.txt.
    /// </summary>
    public static void Main()
    {
        new Spider(2, 2, "http://www.kde.org/", "urls.txt");
    }
}
-
-