A C# web crawler with page details

This class is pretty much the main crawler engine. This was originally written in one file, but I decided to split off some of the extraneous functionality to the CrawlerUtil class…

Here is Spider.cs:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;

using TauCephei.Helpers;
using HtmlAgilityPack;

namespace TauCephei.Crawler
{
    public sealed class Spider
    {
        // User agent string sent with every request (see GetContent and GetStringContent)
        const string UA =
            "Mozilla/5.0 (compatible; Tau Cephei; +http://eksith.wordpress.com/2011/06/12/web-crawler/)";

        // Allowed file extensions to crawl (consulted by GetCleanUri)
        private static string[] WHITE_LIST = 
        { "htm", "html", "xhtml", "xht", "php", "jsp", "asp", "aspx", "pl", "py", "rb", "cgi" };

        // Only these content types will be indexed (checked by IsValidContentType)
        private static string[] CONTENT_TYPES = 
        { "text/plain", "text/html", "text/xhtml", "application/xhtml+xml" };

        // Binary and non-text content stripped from the body in GetEntry
        // (Iframe crawling in the future)
        private static string[] JUNK_TAGS = { "script", "style", "video", "embed", "object", "iframe" };

        // In page anchors and helpers: links starting with any of these are skipped
        private static string[] IGNORE_START = 
        { "#", "javascript:", "mailto:", "skype:" };

        // Links ending with any of these are skipped.
        // This was because I was testing this on wordpress.com
        private static string[] IGNORE_END = 
        { "?share=digg", "?share=facebook", "?share=stumbleupon", "?share=reddit", "?share=email" };

        const int DEPTH = 2;            // Initial value of the crawl depth counter (copied to _crawlDepth)
        const int TIMEOUT = 1000;        // Request timeout; assigned to HttpWebRequest.Timeout, which is in milliseconds
        const int REDIRECTS = 3;        // Maximum number of automatic redirects allowed per request
        const int HEADERS = 8;            // Assigned to HttpWebRequest.MaximumResponseHeadersLength (a size in kilobytes per the .NET docs, not a header count)
        const int CONTENT = 80000;        // Maximum content length; compared against HttpWebResponse.ContentLength

        // Webmasters with bandwidth limits will thank you for this...
        const int MAX_PAGES = 30;        // Maximum pages per crawl (copied to _maxIteration)

        // Global external crawl links: links GetEntry found outside the current root
        private List<string> _externalUrls = new List<string>();

        // Global local crawl links: the queue of local links still waiting to be crawled
        private List<string> _crawlUrls = new List<string>();

        // Global crawl tracker: normalized urls Crawl has already processed
        private List<string> _alreadyCrawled = new List<string>();

        // Global depth tracker; shared by every (recursive) Crawl call on this instance
        private int _crawlDepth = 0;

        // Global iteration tracker; shared by every (recursive) Crawl call on this instance
        private int _iteration = 0;

        // Global max pages tracker; counts down from MAX_PAGES
        private int _maxIteration = 0;

        // Uncrawlable or invalid links, keyed by url with the reason as value.
        // Also receives the robots.txt Disallow paths recorded by ParseRobots.
        // NOTE(review): static, so the map is shared by all Spider instances.
        private static Dictionary<string, string> _failedUrls = 
            new Dictionary<string, string>();

        // Robots flag for this URI; set by ParseRobots.
        // NOTE(review): static and not reset anywhere in this class.
        private static bool _robotsParsed = false;

        // Main authority. Reassigned at the start of every Crawl call, so during a
        // crawl it holds the Uri of the page currently being processed.
        private static Uri _root;


        // Entries produced so far.
        // NOTE(review): this is a public field as well as the backing store of
        // CrawledEntries; external code can replace the list through it.
        public List<Entry> _crawledEntries = new List<Entry>();

        /// <summary>
        /// Entries created by Crawl, in the order they were added.
        /// </summary>
        public List<Entry> CrawledEntries
        {
            get { return _crawledEntries; }
        }

        /// <summary>
        /// Urls that could not be crawled (and robots.txt Disallow paths), mapped to the reason.
        /// </summary>
        public Dictionary<string, string> FailedUrls
        {
            get { return _failedUrls; }
        }

        /// <summary>
        /// Local links that are still queued for crawling.
        /// </summary>
        public List<string> CrawlUrls
        {
            get { return _crawlUrls; }
        }

        /// <summary>
        /// Links found on crawled pages that point outside the current root.
        /// </summary>
        public List<string> ExternalUrls
        {
            get { return _externalUrls; }
        }

        /// <summary>
        /// Normalized urls that Crawl has already processed.
        /// </summary>
        public List<string> AlreadyCrawled
        {
            get { return _alreadyCrawled; }
        }

        /// <summary>
        /// Creates a spider whose page budget and depth counter start at
        /// the MAX_PAGES and DEPTH defaults.
        /// </summary>
        public Spider()
        {
            this._maxIteration = MAX_PAGES;
            this._crawlDepth = DEPTH;
        }

        /// <summary>
        /// Crawls the given url, stores the resulting entry in the crawled entries
        /// list and then keeps crawling queued local links for as long as the shared
        /// depth, iteration and page counters allow.
        /// </summary>
        /// <param name="url">
        /// Url to crawl. It is resolved through CrawlUtil.ConvertToUri against the
        /// current root and normalized up to (and including) the query string.
        /// </param>
        /// <remarks>
        /// Urls that were already crawled, or that a parsed robots.txt rule
        /// disallows, are skipped without producing an entry.
        /// </remarks>
        public void Crawl(string url)
        {
            // Keep the caller's spelling: the queue may hold the link in this form
            // rather than in the normalized form computed below.
            string requested = url;

            _root = CrawlUtil.ConvertToUri(_root, url);
            url = _root.GetLeftPart(UriPartial.Query);

            // Whatever happens next, this link must leave the queue; otherwise the
            // loop below would keep picking it up and burn the page budget on it.
            _crawlUrls.Remove(requested);
            _crawlUrls.Remove(url);

            if (_alreadyCrawled.Contains(url))
                return;

            // Check if robots.txt was parsed for this URL
            if (!_robotsParsed)
                ParseRobots(url);

            // Honour the Disallow rules ParseRobots collected
            if (!UrlIsAllowed(url))
                return;

            // Add the current url to the crawled tracker
            _alreadyCrawled.Add(url);
            Entry entry = GetEntry(GetContent(_root));

            if (entry == null)
                return;

            // Set entry parameters
            entry.Id = Util.GetChecksum(url, "sha1");
            entry.Url = url;
            entry.LastCrawl = DateTime.Now;

            Console.WriteLine("Entry created...");
            Console.WriteLine(entry.Id);
            Console.WriteLine(entry.Title);
            Console.WriteLine("Iteration : " + _iteration +
                " | Depth : " + _crawlDepth + " | Pages remaining : " +
                _maxIteration);
            Console.WriteLine("\n\n");

            // We can now save this entry or do something else.
            // I just added this to a list
            _crawledEntries.Add(entry);

            // Continue crawling for as long as we have depth. The counters are
            // instance-wide, so nested Crawl calls draw from the same budgets.
            while (_crawlDepth > 0)
            {
                if (_iteration <= 0)
                    _iteration = _crawlUrls.Count;

                while (_iteration > 0)
                {
                    if (_crawlUrls.Count > 0 && _maxIteration > 0)
                    {
                        // Dequeue first so every pass makes progress
                        string next = _crawlUrls[0];
                        _crawlUrls.RemoveAt(0);

                        // Only spend page budget on links robots.txt lets us fetch
                        if (UrlIsAllowed(next))
                        {
                            _maxIteration--;
                            Crawl(next);
                        }
                    }
                    _iteration--;
                }

                _crawlDepth--;
            }
        }

        /// <summary>
        /// Downloads the given uri and returns it as an HtmlDocument.
        /// </summary>
        /// <param name="uri">Absolute uri of the page to download</param>
        /// <returns>
        /// The loaded document. When the request fails, or the response has an
        /// unsupported content type or is too large, the reason is recorded in the
        /// failed urls map and an empty document is returned.
        /// </returns>
        private HtmlDocument GetContent(Uri uri)
        {
            // The OptionFixNestedTags / OptionAutoCloseOnEnd switches were iffy on
            // HtmlAgilityPack, so the document is left on its defaults.
            HtmlDocument doc = new HtmlDocument();
            HttpWebRequest req =
                (HttpWebRequest)WebRequest.Create(uri.AbsoluteUri);

            // Set the request properties
            req.UserAgent = UA;
            req.Timeout = TIMEOUT;
            req.MaximumAutomaticRedirections = REDIRECTS;
            req.MaximumResponseHeadersLength = HEADERS;

            try
            {
                // Dispose the response so the underlying connection is released
                using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                {
                    // If the content type is readable and isn't too large.
                    // NOTE(review): ContentLength is -1 when the server sends no
                    // Content-Length header, so such responses pass the size check.
                    if (IsValidContentType(resp.ContentType) &&
                        resp.ContentLength <= CONTENT)
                    {
                        using (Stream body = resp.GetResponseStream())
                        {
                            doc.Load(body);
                        }
                    }
                    else
                    {
                        // Indexer instead of Add: a url that fails twice must not throw
                        _failedUrls[uri.AbsoluteUri] =
                            "Invalid content type or content size too large";
                    }
                }
            }
            catch (WebException ex)
            {
                _failedUrls[uri.AbsoluteUri] = CrawlUtil.GetException(ex);
            }
            catch (IOException ex)
            {
                // The connection dropped or timed out while the body was being read
                _failedUrls[uri.AbsoluteUri] = ex.Message;
            }

            return doc;
        }

        /// <summary>
        /// Parses the page into an Entry object and queues the links found on it.
        /// </summary>
        /// <param name="doc">HtmlDocument to parse</param>
        /// <returns>
        /// The populated Entry, or null when the document is empty or when its
        /// robots meta tag forbids both indexing and following.
        /// </returns>
        private Entry GetEntry(HtmlDocument doc)
        {
            // A failed download comes back as an empty document; there is nothing
            // to index or follow in that case.
            if (doc == null || !doc.DocumentNode.HasChildNodes)
                return null;

            Entry entry = new Entry();

            // Parse the meta tags
            entry.Meta = CrawlUtil.GetMetaTags(doc);

            // If the robots tag is defined
            string robots =
                CrawlUtil.GetMetaKey(entry.Meta, "robots", "index,follow") ?? string.Empty;

            bool follow =
                robots.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) < 0;
            bool index =
                robots.IndexOf("noindex", StringComparison.OrdinalIgnoreCase) < 0;

            // If we can't follow or index the page, then move on
            if (!follow && !index)
                return null;

            // Get the document description info
            entry.Title = CrawlUtil.GetTagValue(doc, "title");
            entry.Description = CrawlUtil.GetMetaKey(entry.Meta, "description", "");

            // Give extra space for the abstract
            entry.Abstract = CrawlUtil.GetMetaKey(entry.Meta, "abstract", "", 500);

            entry.Author = CrawlUtil.GetMetaKey(entry.Meta, "author", "");
            entry.Copyright = CrawlUtil.GetMetaKey(entry.Meta, "copyright", "");

            // A little hesitant to use this due to the abuse potential
            if (entry.Meta.ContainsKey("keywords"))
                entry.Keywords = Util.GetKeywords(entry.Meta["keywords"], 6, true);

            if (entry.Meta.ContainsKey("content-type"))
                entry.Encoding = CrawlUtil.GetEncoding(entry.Meta["content-type"]);

            // Links are only collected when the page allows following them
            if (follow)
            {
                foreach (string href in GetLinks(doc))
                {
                    Uri uri = CrawlUtil.ConvertToUri(_root, href);

                    if (_root.IsBaseOf(uri))
                    {
                        // Global local crawlable links
                        if (!_crawlUrls.Contains(href) &&
                            !_alreadyCrawled.Contains(href))
                            _crawlUrls.Add(href);

                        if (!entry.LocalLinks.Contains(href))
                            entry.LocalLinks.Add(href);
                    }
                    else
                    {
                        // Global external crawlable links
                        if (!_externalUrls.Contains(href))
                            _externalUrls.Add(href);

                        if (!entry.ExternalLinks.Contains(href))
                            entry.ExternalLinks.Add(href);
                    }
                }
            }

            // If indexing is enabled, get the content body
            if (index)
            {
                var body = doc.DocumentNode.Descendants()
                    .FirstOrDefault(b => b.Name == "body");

                if (body != null)
                {
                    // Clean the junk tags first. Look at every descendant, not just
                    // the direct children, so nested script/style blocks go too.
                    // The snapshot keeps the removal from disturbing the enumeration.
                    var junk = body.Descendants()
                        .Where(n => JUNK_TAGS.Contains(n.Name))
                        .ToArray();

                    foreach (var node in junk)
                    {
                        if (node.ParentNode != null)
                            node.Remove();
                    }

                    entry.BodyHtml = body.InnerHtml;
                    entry.BodyText = body.InnerText;
                }
            }

            return entry;
        }



        #region Helper methods

        /// <summary>
        /// Retrieves a given url as a string. So far used for robots.txt.
        /// </summary>
        /// <param name="url">Absolute url to download</param>
        /// <returns>
        /// The downloaded text, or an empty string when the request fails. A
        /// failure is recorded in the failed urls map unless the url ends with
        /// "robots.txt" (a missing robots file is not an error).
        /// </returns>
        private static string GetStringContent(string url)
        {
            string content = "";

            using (CrawlClient client = new CrawlClient())
            {
                client.Headers["User-Agent"] = UA;
                client.Timeout = TIMEOUT;

                try
                {
                    content = client.DownloadString(url);
                }
                catch (WebException ex)
                {
                    // Indexer instead of Add so a url that fails twice does not throw
                    if (!url.EndsWith("robots.txt", StringComparison.Ordinal))
                        _failedUrls[url] = CrawlUtil.GetException(ex);
                }
            }

            return content;
        }

        /// <summary>
        /// Downloads robots.txt from the authority of the given url and records
        /// every Disallow path that applies to this crawler in the failed urls map.
        /// This function is courtesy of R.Reid with very little modification
        /// http://www.strictly-software.com/robotstxt
        /// </summary>
        /// <param name="url">Absolute url whose authority is asked for robots.txt</param>
        /// <remarks>
        /// Allow rules are read but not acted upon. The robots flag is set whether
        /// or not a robots.txt was found.
        /// </remarks>
        private static void ParseRobots(string url)
        {
            Uri curr = new Uri(url);

            // Get the robots.txt file from the root domain, using the page's own
            // scheme so that https sites are asked over https
            string content = GetStringContent(
                curr.GetLeftPart(UriPartial.Authority) + "/robots.txt");

            if (!string.IsNullOrEmpty(content))
            {
                Console.WriteLine("Robots.txt found");

                // Split on both line-break characters explicitly; the file's line
                // endings do not depend on the platform this crawler runs on
                string[] lines = content.Split(
                    new char[] { '\r', '\n' },
                    StringSplitOptions.RemoveEmptyEntries);

                bool apply = false;

                // True while reading a run of consecutive user-agent lines, which
                // together name the bots the following rules belong to
                bool previousWasAgent = false;

                foreach (string line in lines)
                {
                    RobotCommand cmd = new RobotCommand(line);

                    switch (cmd.Command)
                    {
                        case "COMMENT":
                            // Comments do not interrupt a run of user-agent lines
                            continue;

                        case "user-agent":
                            {
                                // NOTE(review): the lower-case needles assume RobotCommand
                                // lower-cases the agent name - confirm
                                string agent = cmd.UserAgent ?? string.Empty;
                                bool matches =
                                    agent.IndexOf("*", StringComparison.Ordinal) >= 0 ||
                                    agent.IndexOf("tau cephei", StringComparison.Ordinal) >= 0 ||
                                    agent.IndexOf("taucephei", StringComparison.Ordinal) >= 0;

                                // A new group starts here unless the previous line was
                                // also a user-agent line; rules meant for other bots
                                // must not leak into ours
                                apply = previousWasAgent ? (apply || matches) : matches;
                                previousWasAgent = true;
                                continue;
                            }

                        case "disallow":
                            if (apply && cmd.Url.Length > 0)
                            {
                                string c = cmd.Url.ToLower();
                                if (!_failedUrls.ContainsKey(c))
                                    _failedUrls.Add(c, "Disallowed in robots.txt");
                            }
                            break;

                        case "allow":
                            break;
                    }

                    // Any other directive ends the run of user-agent lines
                    previousWasAgent = false;
                }
            }
            else
            {
                Console.WriteLine("No robots.txt found");
            }

            _robotsParsed = true;
        }

        /// <summary>
        /// Gets a list of crawlable links in the given HtmlDocument.
        /// </summary>
        /// <param name="doc">Document whose anchor tags are scanned</param>
        /// <returns>
        /// Absolute urls up to and including the query string (fragments are
        /// dropped). Anchors marked rel="nofollow" and links rejected by
        /// GetCleanUri are left out. The list may contain duplicates.
        /// </returns>
        private static List<string> GetLinks(HtmlDocument doc)
        {
            List<string> links = new List<string>();

            // Blank links are probably not meant for bots
            var anchors = from l in doc.DocumentNode.Descendants()
                          where
                              l.Name == "a" &&
                              l.InnerText != null &&
                              l.Attributes["href"] != null
                          select l;

            // Parse through all the links
            foreach (var anchor in anchors)
            {
                // No need to index no follow links
                var rel = anchor.Attributes["rel"];
                if (rel != null &&
                    !String.IsNullOrEmpty(rel.Value) &&
                    rel.Value.IndexOf("nofollow", StringComparison.OrdinalIgnoreCase) >= 0)
                    continue;

                string href = anchor.Attributes["href"].Value;
                if (String.IsNullOrEmpty(href))
                    continue;

                // Only trim: paths and query strings are case-sensitive, so
                // lower-casing the link would point it at a different resource
                href = href.Trim();
                if (href.Length == 0)
                    continue;

                Uri uri = GetCleanUri(href);
                if (uri != null)
                {
                    // Get everything up to the querystring (ignores # etc..)
                    links.Add(uri.GetLeftPart(UriPartial.Query));
                }
            }

            return links;
        }

        /// <summary>
        /// Checks whether a link is worth crawling and resolves it to a Uri.
        /// </summary>
        /// <param name="l">Link as found in the page</param>
        /// <returns>
        /// The resolved Uri, or null when the link is empty, starts or ends with
        /// one of the ignored patterns, carries another url in its query
        /// ("=http"), cannot be resolved, or has a file extension that is not on
        /// the white list. Links without a file extension are accepted.
        /// </returns>
        private static Uri GetCleanUri(string l)
        {
            if (string.IsNullOrEmpty(l))
                return null;

            StringComparison cmp = StringComparison.OrdinalIgnoreCase;

            if (IGNORE_START.Any(prefix => l.StartsWith(prefix, cmp)))
                return null;

            if (IGNORE_END.Any(suffix => l.EndsWith(suffix, cmp)))
                return null;

            // Redirect / share style links that embed another url
            if (l.IndexOf("=http", cmp) > 0)
                return null;

            Uri uri = CrawlUtil.ConvertToUri(_root, l);
            if (uri == null)
                return null;

            // A link is acceptable when it names no file type at all, or when its
            // extension matches ANY white list entry
            string extension = GetPathExtension(uri.AbsolutePath);
            if (extension.Length > 0 &&
                !WHITE_LIST.Any(allowed => string.Equals(allowed, extension, cmp)))
                return null;

            return uri;
        }

        /// <summary>
        /// Returns the extension (without the dot) of the last segment of a url
        /// path, or an empty string when that segment has none.
        /// </summary>
        private static string GetPathExtension(string path)
        {
            int slash = path.LastIndexOf('/');
            int dot = path.LastIndexOf('.');

            // No dot in the last segment, a segment that merely starts with a
            // dot, or a trailing dot: treat all of these as "no extension"
            if (dot <= slash + 1 || dot == path.Length - 1)
                return string.Empty;

            return path.Substring(dot + 1);
        }

        /// <summary>
        /// Checks whether a Content-Type header value starts with one of the
        /// indexable content types.
        /// </summary>
        /// <param name="ct">Content-Type value; may carry parameters such as a charset</param>
        /// <returns>
        /// True when the value begins with an allowed media type, compared without
        /// regard to case. False for a null or empty value.
        /// </returns>
        private static bool IsValidContentType(string ct)
        {
            if (string.IsNullOrEmpty(ct))
                return false;

            // Media types are case-insensitive, so "Text/HTML" must match too
            string mediaType = ct.TrimStart();

            return CONTENT_TYPES.Any(
                allowed => mediaType.StartsWith(allowed, StringComparison.OrdinalIgnoreCase));
        }

        /// <summary>
        /// Tells whether the path of the given url is free of every blocked prefix
        /// held in the failed urls map.
        /// This function is also courtesy of R. Reid
        /// http://www.strictly-software.com/robotstxt
        /// </summary>
        /// <param name="url">Url to test; resolved against the current root</param>
        /// <returns>
        /// True when the map is empty. Otherwise false for "/robots.txt" and for
        /// any path that begins with one of the map's keys, true for the rest.
        /// </returns>
        private static bool UrlIsAllowed(string url)
        {
            // Nothing has been blocked yet, so everything passes
            if (_failedUrls.Count == 0)
                return true;

            Uri resolved = CrawlUtil.ConvertToUri(_root, url);
            string path = resolved.AbsolutePath.ToLower();

            if (path == "/robots.txt")
                return false;

            // Blocked as soon as any key is a prefix of the path
            bool blocked = _failedUrls.Keys.Any(
                prefix => path.StartsWith(prefix, StringComparison.Ordinal));

            return !blocked;
        }

        #endregion
    }
}

Onward to Program.cs —–>
Finally, we have the main Program.cs file for the console app. I would create the Spider() class and run it in a separate thread each time the Crawl function is called from within the Program class.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

using TauCephei.Helpers;
using TauCephei.Crawler;

namespace TauCephei
{
    /// <summary>
    /// Console host: crawls one start url and then waits for a key press.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs a single crawl and pulls the results off the spider.
        /// </summary>
        /// <param name="args">Command line arguments (not used)</param>
        static void Main(string[] args)
        {
            Console.Title = "Tau Cephei";

            var crawler = new Spider();
            crawler.Crawl("http://google.com");

            // The entries are what you may want to save to a database
            List<Entry> entries = crawler.CrawledEntries;

            // The rest of what the crawl left behind
            Dictionary<string, string> failedUrls = crawler.FailedUrls;
            List<string> externalUrls = crawler.ExternalUrls;
            List<string> pendingUrls = crawler.CrawlUrls;

            // Keep the console window open
            Console.ReadLine();
        }
    }
}

And that should be that…

Enjoy!

About these ads

One thought on “A C# web crawler with page details”

  1. Pingback: Web Crawler - USEFUL PORTAL – USEFUL PORTAL

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Connecting to %s