A C# web crawler with page details

This class is pretty much the main crawler engine. It was originally written in one file, but I decided to split off some of the extraneous functionality into the CrawlUtil helper class…
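
Before the listing, a quick note on dependencies: Spider.cs leans on a few helper types from TauCephei.Helpers that aren't reproduced in this post (CrawlUtil, CrawlClient, and the RobotCommand parser from the strictly-software link further down). Purely as a hedged sketch of the shapes the crawler expects, here is roughly what those helpers might look like; the names come from the calls in Spider.cs, but the bodies are guesses rather than the original helper code:

using System;
using System.Net;

namespace TauCephei.Helpers
{
    // Sketch only: the real CrawlUtil also has GetMetaTags, GetMetaKey,
    // GetTagValue and GetEncoding, which are too specific to guess at here
    public static class CrawlUtil
    {
        // Resolve a possibly-relative href against the crawl root
        public static Uri ConvertToUri(Uri root, string href)
        {
            Uri uri;
            if (Uri.TryCreate(href, UriKind.Absolute, out uri))
                return uri;

            return new Uri(root, href);
        }

        // Flatten a WebException into a short reason string for the failed list
        public static string GetException(WebException ex)
        {
            HttpWebResponse resp = ex.Response as HttpWebResponse;
            return (resp != null)
                ? (int)resp.StatusCode + " " + resp.StatusDescription
                : ex.Status.ToString();
        }
    }

    // WebClient has no Timeout property, so CrawlClient is presumably a thin
    // subclass that applies one to each underlying request
    public class CrawlClient : WebClient
    {
        public int Timeout { get; set; }

        protected override WebRequest GetWebRequest(Uri address)
        {
            WebRequest req = base.GetWebRequest(address);
            if (req != null && Timeout > 0)
                req.Timeout = Timeout;

            return req;
        }
    }
}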

Here is Spider.cs:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;

using TauCephei.Helpers;
using HtmlAgilityPack;

namespace TauCephei.Crawler
{
    public sealed class Spider
    {
        // User agent string
        const string UA =
            "Mozilla/5.0 (compatible; Tau Cephei; +https://eksith.wordpress.com/2011/06/12/web-crawler/)";

        // Allowed file extensions to crawl
        private static string[] WHITE_LIST = 
        { "htm", "html", "xhtml", "xht", "php", "jsp", "asp", "aspx", "pl", "py", "rb", "cgi" };

        // Only these content types will be indexed
        private static string[] CONTENT_TYPES = 
        { "text/plain", "text/html", "text/xhtml", "application/xhtml+xml" };

        // Binary and non-text content (Iframe crawling in the future)
        private static string[] JUNK_TAGS = { "script", "style", "video", "embed", "object", "iframe" };

        // In-page anchors and non-HTTP schemes
        private static string[] IGNORE_START = 
        { "#", "javascript:", "mailto:", "skype:" };

        // Share-link suffixes, ignored because I was testing this on wordpress.com
        private static string[] IGNORE_END = 
        { "?share=digg", "?share=facebook", "?share=stumbleupon", "?share=reddit", "?share=email" };

        const int DEPTH = 2;            // Maximum crawl depth per Uri
        const int TIMEOUT = 1000;       // Timeout per request (in milliseconds)
        const int REDIRECTS = 3;        // Maximum number of redirects allowed
        const int HEADERS = 8;          // Maximum response headers length (in KB)
        const int CONTENT = 80000;      // Maximum content length (in bytes)

        // Webmasters with bandwidth limits will thank you for this...
        const int MAX_PAGES = 30;        // Maximum pages per crawl

        // Global external crawl links
        private List<string> _externalUrls = new List<string>();

        // Global local crawl links
        private List<string> _crawlUrls = new List<string>();

        // Global crawl tracker
        private List<string> _alreadyCrawled = new List<string>();

        // Global depth tracker
        private int _crawlDepth = 0;

        // Global iteration tracker
        private int _iteration = 0;

        // Global max pages tracker
        private int _maxIteration = 0;

        // Uncrawlable or invalid links
        private static Dictionary<string, string> _failedUrls = 
            new Dictionary<string, string>();

        // Robots flag for this URI
        private static bool _robotsParsed = false;

        // Main authority
        private static Uri _root;


        private List<Entry> _crawledEntries = new List<Entry>();
        public List<Entry> CrawledEntries
        {
            get { return _crawledEntries; }
        }

        public Dictionary<string, string> FailedUrls
        {
            get { return _failedUrls; }
        }

        public List<string> CrawlUrls
        {
            get { return _crawlUrls; }
        }

        public List<string> ExternalUrls
        {
            get { return _externalUrls; }
        }

        public List<string> AlreadyCrawled
        {
            get { return _alreadyCrawled; }
        }

        // Constructor
        public Spider()
        {
            _crawlDepth = DEPTH;
            _maxIteration = MAX_PAGES;
        }

        /// <summary>
        /// Crawls a given url
        /// </summary>
        /// <param name="url">The URL to crawl</param>
        public void Crawl(string url)
        {
            _root = CrawlUtil.ConvertToUri(_root, url);
            url = _root.GetLeftPart(UriPartial.Query);

            if (_alreadyCrawled.Contains(url))
            {
                _crawlUrls.Remove(url);
            }
            else
            {
                // Check if robots.txt was parsed for this URL
                if (!_robotsParsed)
                    ParseRobots(url);

                // Add the current url to the crawled tracker
                _alreadyCrawled.Add(url);
                Entry entry = GetEntry(GetContent(_root));

                if (entry != null)
                {
                    // Set entry parameters
                    entry.Id = Util.GetChecksum(url, "sha1");
                    entry.Url = url;
                    entry.LastCrawl = DateTime.Now;


                    Console.WriteLine("Entry created...");
                    Console.WriteLine(entry.Id);
                    Console.WriteLine(entry.Title);
                    Console.WriteLine("Iteration : " + _iteration +
                        " | Depth : " + _crawlDepth +" | Pages remaining : "+ 
                        _maxIteration);
                    Console.WriteLine("\n\n");

                    // We can now save this entry or do something else.
                    // I just added this to a list
                    _crawledEntries.Add(entry);

                    // Continue crawling for as long as we have depth
                    while (_crawlDepth > 0)
                    {
                        if (_iteration <= 0)
                            _iteration = _crawlUrls.Count;

                        while (_iteration > 0)
                        {
                            if (_crawlUrls.Count > 0 && _maxIteration > 0)
                            {
                                _maxIteration--;
                                Crawl(_crawlUrls[0]);
                            }
                            _iteration--;
                        }

                        _crawlDepth--;
                    }

                }
            }
        }

        /// <summary>
        /// Downloads the given uri and returns HtmlDocument
        /// </summary>
        private HtmlDocument GetContent(Uri uri)
        {
            HtmlDocument doc = new HtmlDocument();
            HttpWebRequest req = 
                (HttpWebRequest)WebRequest.Create(uri.AbsoluteUri);

            //Following options were iffy on HtmlAgilityPack
            //doc.OptionFixNestedTags = false;
            //doc.OptionAutoCloseOnEnd = false;

            // Set the request properties
            req.UserAgent = UA;
            req.Timeout = TIMEOUT;
            req.MaximumAutomaticRedirections = REDIRECTS;
            req.MaximumResponseHeadersLength = HEADERS;

            try
            {
                // Dispose the response when done so the connection is released
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                {
                    // If the content type is readable and isn't too large
                    if (IsValidContentType(resp.ContentType) &&
                        resp.ContentLength <= CONTENT)
                        doc.Load(resp.GetResponseStream());
                    else if (!_failedUrls.ContainsKey(uri.AbsoluteUri))
                        _failedUrls.Add(uri.AbsoluteUri,
                            "Invalid content type or content size too large");
                }
            }
            catch (WebException ex)
            {
                if (!_failedUrls.ContainsKey(uri.AbsoluteUri))
                    _failedUrls.Add(uri.AbsoluteUri,
                        CrawlUtil.GetException(ex));
            }

            return doc;
        }

        /// <summary>
        /// Parses the page and returns it as an Entry object
        /// </summary>
        /// <param name="doc">HtmlDocument to parse</param>
        /// <returns>Entry object</returns>
        private Entry GetEntry(HtmlDocument doc)
        {
            bool _index = true;
            bool _follow = true;

            Entry entry = new Entry();

            // Parse the meta tags
            entry.Meta = CrawlUtil.GetMetaTags(doc);

            // If the robots tag is defined
            string r = CrawlUtil.GetMetaKey(entry.Meta, "robots", "index,follow");

            r = r.ToLower().Trim();

            if (r.LastIndexOf("nofollow") >= 0)
                _follow = false;
            if (r.LastIndexOf("noindex") >= 0)
                _index = false;

            // If we can't follow or index the page, then move on
            if (!_follow && !_index)
                return null;

            // Get the document description info
            entry.Title = CrawlUtil.GetTagValue(doc, "title");
            entry.Description = CrawlUtil.GetMetaKey(entry.Meta, "description", "");

            // Give extra space for the abstract
            entry.Abstract = CrawlUtil.GetMetaKey(entry.Meta, "abstract", "", 500);

            entry.Author = CrawlUtil.GetMetaKey(entry.Meta, "author", "");
            entry.Copyright = CrawlUtil.GetMetaKey(entry.Meta, "copyright", "");

            // A little hesitant to use this due to the abuse potential
            if (entry.Meta.ContainsKey("keywords"))
                entry.Keywords = Util.GetKeywords(entry.Meta["keywords"], 6, true);

            if (entry.Meta.ContainsKey("content-type"))
                entry.Encoding = CrawlUtil.GetEncoding(entry.Meta["content-type"]);

            // Get all the valid links on this page
            List<string> links = GetLinks(doc);

            foreach (string href in links)
            {
                Uri uri = CrawlUtil.ConvertToUri(_root, href);
                if (_root.IsBaseOf(uri))
                {
                    // Add only links that we can follow
                    if (_follow)
                    {
                        // Global local crawlable links
                        if (!_crawlUrls.Contains(href) && 
                            !_alreadyCrawled.Contains(href))
                            _crawlUrls.Add(href);

                        if (!entry.LocalLinks.Contains(href))
                            entry.LocalLinks.Add(href);
                    }
                }
                else
                {
                    if (_follow)
                    {
                        // Global external crawlable links
                        if (!_externalUrls.Contains(href))
                            _externalUrls.Add(href);

                        if (!entry.ExternalLinks.Contains(href))
                            entry.ExternalLinks.Add(href);
                    }
                }
            }

            // If indexing is enabled, get the content body
            if (_index)
            {
                var body = (from b in doc.DocumentNode.Descendants()
                            where b.Name == "body"
                            select b).FirstOrDefault();

                if (body != null)
                {
                    // Strip junk tags (script, style, etc.) anywhere inside
                    // the body, not just among its direct children
                    foreach (var n in body.Descendants().ToArray())
                    {
                        if (JUNK_TAGS.Contains(n.Name))
                            n.Remove();
                    }

                    entry.BodyHtml = body.InnerHtml;
                    entry.BodyText = body.InnerText;
                }
            }

            return entry;
        }



        #region Helper methods

        /// <summary>
        /// Retrieves a given url as a string. Currently only used for robots.txt
        /// </summary>
        private static string GetStringContent(string url)
        {
            string content = "";
            using (CrawlClient client = new CrawlClient())
            {
                client.Headers["User-Agent"] = UA;
                client.Timeout = TIMEOUT;

                try
                {
                    content = client.DownloadString(url);
                }
                catch (WebException ex)
                {
                    if (!url.EndsWith("robots.txt"))
                        _failedUrls.Add(url, CrawlUtil.GetException(ex));
                }
            }

            return content;
        }

        /// <summary>
        /// This function is courtesy of R. Reid with very little modification
        /// http://www.strictly-software.com/robotstxt
        /// </summary>
        private static void ParseRobots(string url)
        {
            Uri curr = new Uri(url);

            // Get the robots.txt file from the root domain
            string content = GetStringContent("http://" + curr.Authority + "/robots.txt");

            if (!string.IsNullOrEmpty(content))
            {
                Console.WriteLine("Robots.txt found");

                string[] lines = content.Split(
                    Environment.NewLine.ToCharArray(),
                    StringSplitOptions.RemoveEmptyEntries);

                bool apply = false;
                foreach (string line in lines)
                {
                    RobotCommand cmd = new RobotCommand(line);

                    switch (cmd.Command)
                    {
                        case "COMMENT":
                            break;
                        case "user-agent":
                            if (cmd.UserAgent.IndexOf("*") >= 0 ||
                                cmd.UserAgent.IndexOf("tau cephei") >= 0 ||
                                cmd.UserAgent.IndexOf("taucephei") >= 0)
                                apply = true;
                            break;
                        case "disallow":
                            if (apply)
                            {
                                if (cmd.Url.Length > 0)
                                {
                                    string c = cmd.Url.ToLower();
                                    if (!_failedUrls.ContainsKey(c))
                                        _failedUrls.Add(c, "Disallowed in robots.txt");
                                }
                            }
                            break;
                        case "allow":
                            break;
                    }
                }
            }
            else
            {
                Console.WriteLine("No robots.txt found");
            }
            _robotsParsed = true;
        }

        /// <summary>
        /// Gets a list of links in the given HtmlDocument
        /// </summary>
        private static List<string> GetLinks(HtmlDocument doc)
        {
            List<string> links = new List<string>();

            // Blank links are probably not meant for bots
            var list = (from l in doc.DocumentNode.Descendants()
                        where
                            l.Name == "a" &&
                            l.InnerText != null &&
                            l.Attributes["href"] != null
                        select l);

            // Parse through all the links
            foreach (var l in list)
            {
                if (l.Attributes["rel"] != null)
                {
                    string rel = l.Attributes["rel"].Value;
                    if (!String.IsNullOrEmpty(rel))
                    {
                        if (rel.ToLower().Trim().IndexOf("nofollow") >= 0)
                            continue; // No need to index no follow links
                    }
                }

                // Find remaining links
                string href = l.Attributes["href"].Value;
                if (!String.IsNullOrEmpty(href))
                {
                    href = href.ToLower().Trim();
                    Uri uri = GetCleanUri(href);
                    if (uri != null)
                    {
                        // Get everything up to the querystring (ignores # etc..)
                        links.Add(uri.GetLeftPart(UriPartial.Query));
                    }
                }
            }

            return links;
        }

        /// <summary>
        /// Checks the validity of a given link
        /// </summary>
        /// <param name="l">Link href</param>
        /// <returns>A cleaned Uri if the link is crawlable; otherwise null</returns>
        private static Uri GetCleanUri(string l)
        {
            bool result = true;
            Uri uri = null;
            StringComparison cmp = StringComparison.InvariantCultureIgnoreCase;

            for (int i = 0; i < IGNORE_START.Length; i++)
            {
                if (l.StartsWith(IGNORE_START[i], cmp))
                    result = false;
            }

            for (int i = 0; i < IGNORE_END.Length; i++)
            {
                if (l.EndsWith(IGNORE_END[i], cmp))
                    result = false;
            }

            if (l.IndexOf("=http") > 0) { result = false; }

            // If we didn't fail yet...
            if (result)
            {
                uri = CrawlUtil.ConvertToUri(_root, l);
                // If the path ends in a file extension, it must be on the white list
                string ext = Path.GetExtension(uri.AbsolutePath).TrimStart('.');
                if (ext.Length > 0 &&
                    !WHITE_LIST.Contains(ext, StringComparer.InvariantCultureIgnoreCase))
                    result = false;
            }

            if (result)
                return uri;

            return null;
        }

        /// <summary>
        /// Check if the content type matches the constants
        /// </summary>
        private static bool IsValidContentType(string ct)
        {
            foreach (string c in CONTENT_TYPES)
            {
                if (ct.StartsWith(c)) return true;
            }

            return false;
        }

        /// <summary>
        /// Checks if the given Url isn't part of the blocked list.
        /// This function is also courtesy of R. Reid
        /// http://www.strictly-software.com/robotstxt
        /// </summary>
        private static bool UrlIsAllowed(string url)
        {
            // If there are no failed URLs, then we can continue
            if (_failedUrls.Count == 0) return true;

            bool result = true;
            Uri uri = CrawlUtil.ConvertToUri(_root, url);
            url = uri.AbsolutePath.ToLower();

            if (url == "/robots.txt")
            {
                result = false;
            }
            else
            {
                foreach (KeyValuePair<string, string> entry in _failedUrls)
                {
                    if (url.Length >= entry.Key.Length)
                        if (url.Substring(0, entry.Key.Length) == entry.Key)
                            result = false;
                }
            }

            return result;
        }

        #endregion
    }
}
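
The Entry class that holds the per-page details isn't shown here either. Based purely on the members that Spider.cs reads and writes, a minimal sketch might look like the following; the real class may well differ, and the Keywords and Encoding types in particular are guesses:

using System;
using System.Collections.Generic;

namespace TauCephei.Crawler
{
    // Minimal sketch inferred from Spider.cs; the real Entry class may differ
    public class Entry
    {
        public string Id { get; set; }              // SHA1 checksum of the url
        public string Url { get; set; }
        public DateTime LastCrawl { get; set; }

        public string Title { get; set; }
        public string Description { get; set; }
        public string Abstract { get; set; }
        public string Author { get; set; }
        public string Copyright { get; set; }

        public string Keywords { get; set; }        // guess; Util.GetKeywords may return a list
        public string Encoding { get; set; }        // guess; may be a System.Text.Encoding

        public Dictionary<string, string> Meta { get; set; }

        // Link collections are used without null checks, so initialize them
        public List<string> LocalLinks = new List<string>();
        public List<string> ExternalLinks = new List<string>();

        public string BodyHtml { get; set; }
        public string BodyText { get; set; }
    }
}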

Onward to Program.cs -->
Finally, we have the main Program.cs file for the console app. I would create the Spider class and run it in a separate thread each time the Crawl function is called from within the Program class (a rough sketch of that follows the listing below).

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

using TauCephei.Helpers;
using TauCephei.Crawler;

namespace TauCephei
{
    class Program
    {
        static void Main(string[] args)
        {
            Console.Title = "Tau Cephei";

            Spider spider = new Spider();

            spider.Crawl("http://google.com");

            // Let's pull up what we have from that...
            List<string> NewCrawlUrls = spider.CrawlUrls;
            List<string> NewExternalUrls = spider.ExternalUrls;
            Dictionary<string, string> FailedUrls = spider.FailedUrls;

            // This is what you may want to save to a database
            List<Entry> Entries = spider.CrawledEntries;

            Console.ReadLine();
        }
    }
}
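
As mentioned before the listing, the idea is to run each crawl on its own thread. Here is a rough sketch of that, with placeholder seed URLs; note that Spider currently keeps a few static fields (_failedUrls, _robotsParsed and _root), so those would need to become instance fields before crawls can safely run in parallel:

using System;
using System.Collections.Generic;
using System.Threading;

using TauCephei.Crawler;

namespace TauCephei
{
    class ThreadedProgram
    {
        static void Main(string[] args)
        {
            // Placeholder seed list; one Spider (and one thread) per seed
            string[] seeds = { "http://example.com", "http://example.org" };
            List<Thread> threads = new List<Thread>();

            foreach (string seed in seeds)
            {
                string url = seed; // copy to avoid capturing the loop variable
                Thread t = new Thread(() =>
                {
                    Spider spider = new Spider();
                    spider.Crawl(url);

                    Console.WriteLine(url + " : " +
                        spider.CrawledEntries.Count + " entries crawled");
                });

                t.Start();
                threads.Add(t);
            }

            // Wait for every crawl to finish before exiting
            foreach (Thread t in threads)
                t.Join();

            Console.ReadLine();
        }
    }
}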

And that should be that…

Enjoy!


3 thoughts on “A C# web crawler with page details”

  1. Pingback: Web Crawler – USEFUL PORTAL

  2. Nice solution, but after publishing it on a server we ran into an issue: after a certain number of requests, Google shows a network-traffic CAPTCHA that has to be solved before you can view any results.
    Is there a way to bypass that CAPTCHA and view the results directly?

    Please suggest a solution for this problem.
    Raj Mouli (mouli.raji@gmail.com)

    • Hi Raj,

      Unfortunately, there’s no direct or simple way to do that as CAPTCHAs are specifically designed to make sure you’re not a crawler or bot ;)

      There are ways around that, but they go into optical character recognition and a whole heap of programming that goes beyond the basic crawler. Maybe I’ll do a post about it in the future, but in the meantime, you can look into “CAPTCHA auto fill” or something similar.
