This class is pretty much the main crawler engine. This was originally written in one file, but I decided to split off some of the extraneous functionality to the CrawlerUtil class…
Here is Spider.cs:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using TauCephei.Helpers;
using HtmlAgilityPack;
namespace TauCephei.Crawler
{
public sealed class Spider
{
// User agent string sent with every request made by this crawler
// (page downloads and the robots.txt download).
const string UA =
"Mozilla/5.0 (compatible; Tau Cephei; +http://eksith.wordpress.com/2011/06/12/web-crawler/)";
// Allowed file extensions to crawl (consulted when a link is validated)
private static string[] WHITE_LIST =
{ "htm", "html", "xhtml", "xht", "php", "jsp", "asp", "aspx", "pl", "py", "rb", "cgi" };
// Only these content types will be indexed.
// The response Content-Type header is matched against these by prefix.
private static string[] CONTENT_TYPES =
{ "text/plain", "text/html", "text/xhtml", "application/xhtml+xml" };
// Binary and non-text content (Iframe crawling in the future).
// Elements with these names are stripped from the page body before it is stored.
private static string[] JUNK_TAGS = { "script", "style", "video", "embed", "object", "iframe" };
// In page anchors and helpers: links starting with any of these are skipped
private static string[] IGNORE_START =
{ "#", "javascript:", "mailto:", "skype:" };
// Links ending with any of these are skipped.
// This was because I was testing this on wordpress.com
private static string[] IGNORE_END =
{ "?share=digg", "?share=facebook", "?share=stumbleupon", "?share=reddit", "?share=email" };
const int DEPTH = 2; // Maximum crawl depth per Uri (initial value of _crawlDepth)
const int TIMEOUT = 1000; // Timeout per request (assigned to the request Timeout, i.e. milliseconds)
const int REDIRECTS = 3; // Maximum number of redirects allowed
const int HEADERS = 8; // Maximum response header size (assigned to MaximumResponseHeadersLength, which is in kilobytes)
const int CONTENT = 80000; // Maximum content length (compared with the response ContentLength)
// Webmasters with bandwidth limits will thank you for this...
const int MAX_PAGES = 30; // Maximum pages per crawl (initial value of _maxIteration)
// Global external crawl links: links that are not under the current root
private List<string> _externalUrls = new List<string>();
// Global local crawl links: the queue of links still to be crawled
private List<string> _crawlUrls = new List<string>();
// Global crawl tracker: every url that Crawl() has already started on
private List<string> _alreadyCrawled = new List<string>();
// Global depth tracker: remaining depth passes, counted down by Crawl()
private int _crawlDepth = 0;
// Global iteration tracker: queued urls left to process in the current depth pass
private int _iteration = 0;
// Global max pages tracker: remaining page budget, counted down by Crawl()
private int _maxIteration = 0;
// Uncrawlable or invalid links. Key is the url (or a disallowed robots.txt path),
// value is the reason it failed.
// NOTE(review): this and the two fields below are static, so they are shared by
// every Spider instance in the process - confirm that is intended before running
// several spiders side by side.
private static Dictionary<string, string> _failedUrls =
new Dictionary<string, string>();
// Robots flag for this URI: set once robots.txt has been fetched and parsed
private static bool _robotsParsed = false;
// Main authority: the Uri of the page currently being crawled; relative links
// are resolved against it
private static Uri _root;
// Entries created so far, exposed through the CrawledEntries property.
// NOTE(review): this backing field is itself public - confirm nothing depends on
// that before making it private.
public List<Entry> _crawledEntries = new List<Entry>();
/// <summary>
/// Entries created by Crawl() so far, in the order they were added.
/// Returns the live list, not a copy.
/// </summary>
public List<Entry> CrawledEntries
{
get { return _crawledEntries; }
}
/// <summary>
/// Urls (and disallowed robots.txt paths) that could not be crawled,
/// mapped to the reason. Backed by a static dictionary.
/// </summary>
public Dictionary<string, string> FailedUrls
{
get { return _failedUrls; }
}
/// <summary>
/// Local links that have been discovered and are queued for crawling.
/// </summary>
public List<string> CrawlUrls
{
get { return _crawlUrls; }
}
/// <summary>
/// Links discovered on crawled pages that are not under the current root.
/// </summary>
public List<string> ExternalUrls
{
get { return _externalUrls; }
}
/// <summary>
/// Urls that Crawl() has already started processing.
/// </summary>
public List<string> AlreadyCrawled
{
get { return _alreadyCrawled; }
}
/// <summary>
/// Creates a spider with the default limits: the depth counter starts at
/// DEPTH and the page budget starts at MAX_PAGES.
/// </summary>
public Spider()
{
_crawlDepth = DEPTH;
_maxIteration = MAX_PAGES;
}
/// <summary>
/// Crawls a given url, stores the resulting Entry in CrawledEntries and then
/// keeps crawling the queued local links until the depth counter or the page
/// budget runs out.
/// </summary>
/// <param name="url">
/// Url to crawl. Relative urls are resolved against the page crawled before it.
/// </param>
public void Crawl(string url)
{
    string requested = url;
    _root = CrawlUtil.ConvertToUri(_root, url);
    url = _root.GetLeftPart(UriPartial.Query);

    // Dequeue the url right away, whether or not it gets crawled. Leaving it at
    // the head of the queue made the loop below visit every url twice (once to
    // crawl it, once only to remove it), which burned two units of the page
    // budget per page. The queue can hold the raw or the normalized form, so
    // both are removed.
    _crawlUrls.Remove(requested);
    if (!string.Equals(requested, url, StringComparison.Ordinal))
    {
        _crawlUrls.Remove(url);
    }

    if (_alreadyCrawled.Contains(url))
    {
        return;
    }

    // Check if robots.txt was parsed for this URL
    if (!_robotsParsed)
    {
        ParseRobots(url);
    }

    // Add the current url to the crawled tracker
    _alreadyCrawled.Add(url);

    // Honour robots.txt: the rules were collected above but were never
    // consulted before a page was downloaded.
    if (!UrlIsAllowed(url))
    {
        if (!_failedUrls.ContainsKey(url))
        {
            _failedUrls[url] = "Disallowed in robots.txt";
        }
        return;
    }

    Entry entry = GetEntry(GetContent(_root));
    if (entry == null)
    {
        // The page forbids both indexing and following
        return;
    }

    // Set entry parameters
    entry.Id = Util.GetChecksum(url, "sha1");
    entry.Url = url;
    entry.LastCrawl = DateTime.Now;

    Console.WriteLine("Entry created...");
    Console.WriteLine(entry.Id);
    Console.WriteLine(entry.Title);
    Console.WriteLine("Iteration : " + _iteration +
        " | Depth : " + _crawlDepth + " | Pages remaining : " +
        _maxIteration);
    Console.WriteLine("\n\n");

    // We can now save this entry or do something else.
    // I just added this to a list
    _crawledEntries.Add(entry);

    // Continue crawling for as long as we have depth. The counters are
    // instance-wide, so nested Crawl() calls share the same budget.
    while (_crawlDepth > 0)
    {
        if (_iteration <= 0)
        {
            _iteration = _crawlUrls.Count;
        }

        while (_iteration > 0)
        {
            if (_crawlUrls.Count > 0 && _maxIteration > 0)
            {
                _maxIteration--;
                Crawl(_crawlUrls[0]);
            }
            _iteration--;
        }

        _crawlDepth--;
    }
}
/// <summary>
/// Downloads the given uri and returns it as an HtmlDocument.
/// </summary>
/// <param name="uri">Absolute uri of the page to download</param>
/// <returns>
/// The parsed document. When the download fails or the response is rejected
/// (wrong content type or too large) the document is returned empty and the
/// reason is recorded in the failed urls.
/// </returns>
private HtmlDocument GetContent(Uri uri)
{
    HtmlDocument doc = new HtmlDocument();
    HttpWebRequest req =
        (HttpWebRequest)WebRequest.Create(uri.AbsoluteUri);

    // Set the request properties
    req.UserAgent = UA;
    req.Timeout = TIMEOUT;
    req.MaximumAutomaticRedirections = REDIRECTS;
    req.MaximumResponseHeadersLength = HEADERS;

    try
    {
        // The response holds the underlying connection; dispose it so the
        // connection goes back to the pool instead of leaking on every page.
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            // If the content type is readable and isn't too large
            if (IsValidContentType(resp.ContentType) &&
                resp.ContentLength <= CONTENT)
            {
                using (Stream body = resp.GetResponseStream())
                {
                    doc.Load(body);
                }
            }
            else
            {
                // Indexer instead of Add(): the dictionary is static, and
                // Add() throws when the same url has already failed once.
                _failedUrls[uri.AbsoluteUri] =
                    "Invalid content type or content size too large";
            }
        }
    }
    catch (WebException ex)
    {
        _failedUrls[uri.AbsoluteUri] = CrawlUtil.GetException(ex);
    }
    catch (IOException ex)
    {
        // The connection can drop while the body is being read; record the
        // page as failed rather than aborting the whole crawl.
        _failedUrls[uri.AbsoluteUri] = ex.Message;
    }

    return doc;
}
/// <summary>
/// Parses a downloaded page into an Entry object and queues the links found on it.
/// </summary>
/// <param name="doc">HtmlDocument to parse</param>
/// <returns>
/// Entry object, or null when the robots meta tag forbids both indexing and
/// following the page.
/// </returns>
private Entry GetEntry(HtmlDocument doc)
{
    Entry entry = new Entry();

    // Parse the meta tags
    entry.Meta = CrawlUtil.GetMetaTags(doc);

    // If the robots tag is not defined, "index,follow" is assumed.
    // Ordinal, case-insensitive matching avoids culture-dependent lower-casing.
    string robots = CrawlUtil.GetMetaKey(entry.Meta, "robots", "index,follow") ?? "";
    bool follow = robots.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) < 0;
    bool index = robots.IndexOf("noindex", StringComparison.OrdinalIgnoreCase) < 0;

    // If we can't follow or index the page, then move on
    if (!follow && !index)
    {
        return null;
    }

    // Get the document description info
    entry.Title = CrawlUtil.GetTagValue(doc, "title");
    entry.Description = CrawlUtil.GetMetaKey(entry.Meta, "description", "");
    // Give extra space for the abstract
    entry.Abstract = CrawlUtil.GetMetaKey(entry.Meta, "abstract", "", 500);
    entry.Author = CrawlUtil.GetMetaKey(entry.Meta, "author", "");
    entry.Copyright = CrawlUtil.GetMetaKey(entry.Meta, "copyright", "");

    // A little hesitant to use this due to the abuse potential
    if (entry.Meta.ContainsKey("keywords"))
    {
        entry.Keywords = Util.GetKeywords(entry.Meta["keywords"], 6, true);
    }

    if (entry.Meta.ContainsKey("content-type"))
    {
        entry.Encoding = CrawlUtil.GetEncoding(entry.Meta["content-type"]);
    }

    // Links are only recorded when the page may be followed, so don't even
    // extract them otherwise.
    if (follow)
    {
        foreach (string href in GetLinks(doc))
        {
            Uri uri = CrawlUtil.ConvertToUri(_root, href);
            if (uri == null)
            {
                // Unresolvable link; IsBaseOf(null) would throw
                continue;
            }

            if (_root.IsBaseOf(uri))
            {
                // Global local crawlable links
                if (!_crawlUrls.Contains(href) &&
                    !_alreadyCrawled.Contains(href))
                {
                    _crawlUrls.Add(href);
                }

                if (!entry.LocalLinks.Contains(href))
                {
                    entry.LocalLinks.Add(href);
                }
            }
            else
            {
                // Global external crawlable links
                if (!_externalUrls.Contains(href))
                {
                    _externalUrls.Add(href);
                }

                if (!entry.ExternalLinks.Contains(href))
                {
                    entry.ExternalLinks.Add(href);
                }
            }
        }
    }

    // If indexing is enabled, get the content body
    if (index)
    {
        var body = doc.DocumentNode.Descendants()
            .FirstOrDefault(n => n.Name == "body");

        if (body != null)
        {
            // Clean the junk tags first. Scripts and styles are usually nested
            // inside other elements, so every descendant is checked, not just
            // the direct children of <body>. The matches are snapshotted with
            // ToArray() because removing nodes changes the tree being walked.
            var junk = body.Descendants()
                .Where(n => JUNK_TAGS.Contains(n.Name))
                .ToArray();

            foreach (var node in junk)
            {
                node.Remove();
            }

            entry.BodyHtml = body.InnerHtml;
            entry.BodyText = body.InnerText;
        }
    }

    return entry;
}
#region Helper methods
/// <summary>
/// Retrieves a given url as a string. So far only used for robots.txt.
/// </summary>
/// <param name="url">Absolute url to download</param>
/// <returns>
/// The response body, or an empty string when the download fails. Failures are
/// recorded in the failed urls unless the url is a robots.txt file, since a
/// missing robots.txt is not an error.
/// </returns>
private static string GetStringContent(string url)
{
    string content = "";

    using (CrawlClient client = new CrawlClient())
    {
        client.Headers["User-Agent"] = UA;
        client.Timeout = TIMEOUT;

        try
        {
            content = client.DownloadString(url);
        }
        catch (WebException ex)
        {
            if (!url.EndsWith("robots.txt", StringComparison.OrdinalIgnoreCase))
            {
                // Indexer instead of Add(): Add() throws if this url has
                // already been recorded as failed.
                _failedUrls[url] = CrawlUtil.GetException(ex);
            }
        }
    }

    return content;
}
/// <summary>
/// Downloads robots.txt for the host of the given url and records every
/// Disallow path that applies to this crawler in the failed urls.
/// This function is courtesy of R.Reid with very little modification
/// http://www.strictly-software.com/robotstxt
/// </summary>
/// <param name="url">Absolute url of any page on the site being crawled</param>
private static void ParseRobots(string url)
{
    Uri curr = new Uri(url);

    // Get the robots.txt file from the root domain. GetLeftPart keeps the
    // scheme of the crawled url, so https sites are asked over https instead
    // of a hard-coded "http://".
    string content = GetStringContent(
        curr.GetLeftPart(UriPartial.Authority) + "/robots.txt");

    if (!string.IsNullOrEmpty(content))
    {
        Console.WriteLine("Robots.txt found");

        // Split on both line-ending characters explicitly; Environment.NewLine
        // is platform dependent and would leave a stray '\r' on some platforms.
        string[] lines = content.Split(
            new[] { '\r', '\n' },
            StringSplitOptions.RemoveEmptyEntries);

        // Whether the rules of the current User-agent group apply to us
        bool apply = false;
        // Whether the previous directive was also a User-agent line, i.e. we
        // are still inside the header of the same group
        bool inAgentRun = false;

        foreach (string line in lines)
        {
            RobotCommand cmd = new RobotCommand(line);

            switch (cmd.Command)
            {
                case "COMMENT":
                    break;

                case "user-agent":
                    string agent = cmd.UserAgent ?? "";
                    bool matches =
                        agent.IndexOf("*", StringComparison.Ordinal) >= 0 ||
                        agent.IndexOf("tau cephei", StringComparison.OrdinalIgnoreCase) >= 0 ||
                        agent.IndexOf("taucephei", StringComparison.OrdinalIgnoreCase) >= 0;

                    // Consecutive User-agent lines name several bots for one
                    // group; a User-agent line after rules starts a new group
                    // and must reset the flag, otherwise rules written for
                    // other bots get applied to this crawler.
                    apply = inAgentRun ? (apply || matches) : matches;
                    inAgentRun = true;
                    break;

                case "disallow":
                    inAgentRun = false;
                    if (apply && cmd.Url.Length > 0)
                    {
                        string c = cmd.Url.ToLowerInvariant();
                        if (!_failedUrls.ContainsKey(c))
                        {
                            _failedUrls.Add(c, "Disallowed in robots.txt");
                        }
                    }
                    break;

                case "allow":
                    inAgentRun = false;
                    break;
            }
        }
    }
    else
    {
        Console.WriteLine("No robots.txt found");
    }

    _robotsParsed = true;
}
/// <summary>
/// Gets a list of crawlable links in the given HtmlDocument.
/// </summary>
/// <param name="doc">Document to scan for anchor tags</param>
/// <returns>
/// Absolute urls, each cut off after the query string, of every anchor that is
/// not marked rel="nofollow" and passes GetCleanUri. May contain duplicates.
/// </returns>
private static List<string> GetLinks(HtmlDocument doc)
{
    List<string> links = new List<string>();

    // Blank links are probably not meant for bots
    var anchors = doc.DocumentNode.Descendants()
        .Where(n =>
            n.Name == "a" &&
            n.InnerText != null &&
            n.Attributes["href"] != null);

    // Parse through all the links
    foreach (var anchor in anchors)
    {
        var rel = anchor.Attributes["rel"];
        if (rel != null &&
            !String.IsNullOrEmpty(rel.Value) &&
            rel.Value.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            continue; // No need to index no follow links
        }

        // Find remaining links
        string href = anchor.Attributes["href"].Value;
        if (String.IsNullOrEmpty(href))
        {
            continue;
        }

        // Only trim. The href used to be lower-cased as a whole, but url paths
        // and query strings are case sensitive, so that produced addresses
        // that do not exist on the server. GetCleanUri already compares its
        // prefixes and suffixes case-insensitively.
        href = href.Trim();
        if (href.Length == 0)
        {
            continue;
        }

        Uri uri = GetCleanUri(href);
        if (uri != null)
        {
            // Get everything up to the querystring (ignores # etc..)
            links.Add(uri.GetLeftPart(UriPartial.Query));
        }
    }

    return links;
}
/// <summary>
/// Checks the validity of a given link and resolves it to an absolute Uri.
/// </summary>
/// <param name="l">Link, as found in the href attribute</param>
/// <returns>
/// The resolved Uri if the link is crawlable; null when it starts or ends with
/// an ignored token, embeds another url in its query string, cannot be
/// resolved, or points to a file whose extension is not white-listed.
/// </returns>
private static Uri GetCleanUri(string l)
{
    StringComparison cmp = StringComparison.OrdinalIgnoreCase;

    foreach (string prefix in IGNORE_START)
    {
        if (l.StartsWith(prefix, cmp))
        {
            return null;
        }
    }

    foreach (string suffix in IGNORE_END)
    {
        if (l.EndsWith(suffix, cmp))
        {
            return null;
        }
    }

    // Another url passed as a parameter value (share/redirect style links)
    if (l.IndexOf("=http", cmp) > 0)
    {
        return null;
    }

    Uri uri = CrawlUtil.ConvertToUri(_root, l);
    if (uri == null || !uri.IsAbsoluteUri)
    {
        return null;
    }

    // Extension of the last path segment, if it has one. Uri.IsFile cannot be
    // used for this: it only reports whether the scheme is "file:", so the
    // white list never applied to web links. The old loop also rejected a link
    // unless it ended with *every* white-listed extension at once.
    string path = uri.AbsolutePath;
    int slash = path.LastIndexOf('/');
    int dot = path.LastIndexOf('.');

    if (dot > slash && dot < path.Length - 1)
    {
        string extension = path.Substring(dot + 1);
        bool allowed = false;

        foreach (string ext in WHITE_LIST)
        {
            if (string.Equals(extension, ext, cmp))
            {
                allowed = true;
                break;
            }
        }

        if (!allowed)
        {
            return null;
        }
    }

    // No extension at all (directories, clean urls) is always crawlable
    return uri;
}
/// <summary>
/// Check if the content type matches one of the indexable content types.
/// </summary>
/// <param name="ct">
/// Value of the Content-Type header; may carry parameters such as
/// "; charset=utf-8" and may be null or empty when the header is missing.
/// </param>
/// <returns>True if the media type is one of CONTENT_TYPES.</returns>
private static bool IsValidContentType(string ct)
{
    // A response without a Content-Type header cannot be classified
    if (string.IsNullOrEmpty(ct))
    {
        return false;
    }

    ct = ct.Trim();

    foreach (string c in CONTENT_TYPES)
    {
        // Media types are case-insensitive ("Text/HTML" is valid)
        if (ct.StartsWith(c, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Tells whether a url may be crawled, i.e. its path does not begin with any
/// key of the failed/blocked list and is not robots.txt itself.
/// This function is also courtesy of R. Reid
/// http://www.strictly-software.com/robotstxt
/// </summary>
/// <param name="url">Url to test; resolved against the current root</param>
/// <returns>False if the url is blocked, true otherwise.</returns>
private static bool UrlIsAllowed(string url)
{
    // Nothing has been blocked yet, so everything is allowed
    if (_failedUrls.Count == 0)
    {
        return true;
    }

    Uri resolved = CrawlUtil.ConvertToUri(_root, url);
    string path = resolved.AbsolutePath.ToLower();

    if (path == "/robots.txt")
    {
        return false;
    }

    // Blocked as soon as the path begins with any recorded key
    foreach (string blocked in _failedUrls.Keys)
    {
        if (path.StartsWith(blocked, StringComparison.Ordinal))
        {
            return false;
        }
    }

    return true;
}
#endregion
}
}
Onward to Program.cs —–>
Finally, we have the main Program.cs file for the console app. I would create the Spider() class and run it in a separate thread each time the Crawl function is called from within the Program class.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using TauCephei.Helpers;
using TauCephei.Crawler;
namespace TauCephei
{
    /// <summary>
    /// Console entry point: runs one crawl and collects its results.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            Console.Title = "Tau Cephei";

            Spider crawler = new Spider();
            crawler.Crawl("http://google.com");

            // Results of the crawl: links still queued, links leaving the
            // site, and the urls that could not be crawled
            List<string> pendingLocalUrls = crawler.CrawlUrls;
            List<string> pendingExternalUrls = crawler.ExternalUrls;
            Dictionary<string, string> failures = crawler.FailedUrls;

            // This is what you may want to save to a database
            List<Entry> crawled = crawler.CrawledEntries;

            // Keep the console window open until Enter is pressed
            Console.ReadLine();
        }
    }
}
And that should be that…
Enjoy!
Pingback: Web Crawler - USEFUL PORTAL – USEFUL PORTAL