A C# web crawler with page details

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Security.Cryptography;
using System.IO;

namespace TauCephei.Helpers
{
    public static class Util
    {
        /// <summary>
        /// Gets the checksum of a local file or some text.
        /// </summary>
        /// <param name="source">Path to a file or a string</param>
        /// <param name="mode">Checksum mode in sha1, sha256, sha512 or md5 (default)</param>
        /// <param name="isFile">True if file mode or false for text mode (defaults to false)</param>
        /// <returns>Completed checksum</returns>
        public static string GetChecksum(string source, string mode = "md5", bool isFile = false)
        {
            byte[] bytes = { };
            Stream fs;

            if (isFile)
                fs = new BufferedStream(File.OpenRead(source), 120000);
            else
                fs = new MemoryStream(Encoding.UTF8.GetBytes(source));

            switch (mode.ToLower())
            {
                case "sha1":
                    using (SHA1CryptoServiceProvider sha1 = 
                        new SHA1CryptoServiceProvider())
                        bytes = sha1.ComputeHash(fs);
                    break;

                case "sha256":
                    using (SHA256CryptoServiceProvider sha256 = 
                        new SHA256CryptoServiceProvider())
                        bytes = sha256.ComputeHash(fs);
                    break;

                case "sha512":
                    using (SHA512CryptoServiceProvider sha512 = 
                        new SHA512CryptoServiceProvider())
                        bytes = sha512.ComputeHash(fs);
                    break;

                case "md5":
                default:
                    using (MD5CryptoServiceProvider md5 = 
                        new MD5CryptoServiceProvider())
                        bytes = md5.ComputeHash(fs);
                    break;
            }

            // Cleanup
            fs.Dispose();

            return BitConverter
                    .ToString(bytes)
                    .Replace("-", "")
                    .ToLower();
        }

        /// <summary>
        /// Checks if any items in one array are contained in the other
        /// </summary>
        /// <param name="e">Container array</param>
        /// <param name="m">Search array</param>
        /// <returns>True if any item in m exists in e</returns>
        public static bool MatchExists(string[] e, string[] m)
        {
            return e.Any(g => m.Contains(g));
        }


        /// <summary>
        /// Parses a string to an int, falling back to a default when parsing fails or the value is at or below the minimum
        /// </summary>
        public static int DefaultInt(string val, int d, int? min)
        {
            int tmp = 0;

            if (!Int32.TryParse(val, out tmp))
                tmp = d;

            if (min.HasValue)
                if (tmp <= min.Value) tmp = d;

            return tmp;
        }

        /// <summary>
        /// Checks whether a string has a value, setting a default when it's null or empty
        /// </summary>
        public static string DefaultString(string val, string d)
        {
            if (string.IsNullOrEmpty(val)) val = d;

            val.Replace("\r", Environment.NewLine)
                .Replace("\n", Environment.NewLine);

            return val;
        }

        /// <summary>
        /// Returns a flat (no line breaks) string or a default value if empty
        /// </summary>
        public static string DefaultFlatString(string val, string d, int l = 255)
        {
            string ret = Util.DefaultString(val, d).Replace(Environment.NewLine, "");

            if (ret.Length > l)
                ret = ret.Substring(0, l);

            return ret;
        }

        /// <summary>
        /// Gets an array of cleaned keywords
        /// </summary>
        /// <param name="txt">A comma delimited string of keywords</param>
        /// <param name="limit">Limit s the number of tags returned</param>
        /// <param name="tolower">Optional parameter to convert the text to lowercase</param>
        /// <returns>Array of cleaned keywords</returns>
        public static List<string> GetKeywords(string txt, int limit, bool tolower = true)
        {
            string[] tags = txt.Split(',');
            List<string> clean = new List<string>();

            for (int i = 0; i < tags.Length; i++)
            {
                // Stop once the requested number of keywords has been collected
                if (clean.Count >= limit)
                    break;

                tags[i] = Util.DefaultFlatString(tags[i], "");

                if (!String.IsNullOrEmpty(tags[i]))
                {
                    if (tolower)
                        clean.Add(tags[i].ToLower());
                    else
                        clean.Add(tags[i]);
                }
            }

            return clean;
        }
    }
}

You may notice the GetChecksum function is identical to the one in my previous post; I was just waiting for an excuse to use it ;). Some of the other functions, like DefaultInt, aren’t used yet, but they’re there in case they come in handy for parsing more types of meta tags and such.
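
For reference, here's a minimal sketch of how these helpers might be called; the inputs (and the file path in the comment) are made up for illustration:

using System;
using System.Collections.Generic;
using TauCephei.Helpers;

class UtilDemo
{
    static void Main()
    {
        // Hash a string (text mode is the default)
        Console.WriteLine(Util.GetChecksum("hello world", "sha256"));

        // Hash a local file instead (hypothetical path)
        // string fileHash = Util.GetChecksum(@"C:\temp\page.html", "md5", true);

        // Falls back to 10 because "abc" doesn't parse as an int
        int depth = Util.DefaultInt("abc", 10, 1);

        // Split a comma-delimited keyword list, lowercased, at most 5 entries
        List<string> tags = Util.GetKeywords("News,Sports,,Weather", 5);
        Console.WriteLine(string.Join(", ", tags));
    }
}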

Next up is a simple WebClient wrapper used to introduce a timeout limit for requests. The underlying HttpWebRequest class has a Timeout property, but WebClient doesn’t expose it, so we need this subclass.

using System;
using System.Net;

namespace TauCephei.Crawler
{
    public class CrawlClient : WebClient
    {
        private int _timeout = 50000;
        public int Timeout
        {
            get { return _timeout; }
            set { _timeout = (value >= 100) ? value : 50000; }
        }
        
        protected override WebRequest GetWebRequest(Uri address)
        {
            var result = base.GetWebRequest(address);
            result.Timeout = this._timeout;
            
            return result;
        }
    }
}
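
Usage is the same as a plain WebClient; here's a quick sketch, assuming a placeholder URL:

using System;
using TauCephei.Crawler;

class CrawlClientDemo
{
    static void Main()
    {
        using (CrawlClient client = new CrawlClient())
        {
            client.Timeout = 10000; // 10 seconds instead of the 50-second default

            // Requests taking longer than Timeout throw a WebException
            string html = client.DownloadString("http://example.com/");
            Console.WriteLine(html.Length);
        }
    }
}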

And this is R. Reid’s RobotCommand class. Very little modification was needed here…

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

// This class is courtesy of R.Reid 
// http://www.strictly-software.com/robotstxt
namespace TauCephei.Crawler
{
    /*
     * This class will take a line from a Robots.txt
     * and parse it to return a command either
     * COMMENT - commented out line
     * USER-AGENT - the name of the useragent to apply the rule to
     * DISALLOW - command to disallow an agent to a Uri
     * ALLOW - Only used by Google - reverse of DISALLOW
     * and also a URL
     */

    public class RobotCommand
    {
        // Each RobotCommand instance represents a single parsed robots.txt
        // line, so these fields are per-instance rather than static
        private string _Command = string.Empty;
        private string _Url = string.Empty;
        private string _Useragent = string.Empty;

        /*
         * Will convert a robots.txt line [command]:[rule] into either
         * command = user-agent AND useragent = Googlebot
         * OR
         * command = DISALLOW AND URL = /search
         */

        public RobotCommand(string commandline)
        {
            int PosOfComment = commandline.IndexOf('#');
            if (PosOfComment == 0)
            {
                // whole line is a comment
                _Command = "COMMENT";
            }
            else
            {
                // there is a comment on the line so remove it
                if (PosOfComment >= 0)
                {
                    commandline = commandline.Substring(0, PosOfComment);
                }
                // now if we have an instruction
                if (commandline.Length > 0)
                {
                    /* 
                     * split our line on : e.g. turn "User-agent: GoogleBot"
                     * into _Command = user-agent and _Useragent = googlebot
                     */
                    string[] lineArray = commandline.Split(':');
                    _Command = lineArray[0].Trim().ToLower();
                    if (lineArray.Length > 1)
                    {
                        // set appropriate property depending on command type
                        if (_Command == "user-agent")
                        {
                            _Useragent = lineArray[1].ToLower().Trim();
                        }
                        else
                        {
                            _Url = lineArray[1].Trim();
                            // if the URL is a full URL e.g sitemaps then it will contain
                            // a : so add to URL
                            if (lineArray.Length > 2)
                            {
                                _Url += ":" + lineArray[2].Trim();
                            }
                        }

                    }
                }
            }
        }

        public string Command
        {
            get { return _Command; }
        }

        public string Url
        {
            get { return _Url; }
        }

        public string UserAgent
        {
            get { return _Useragent; }
        }
    }
}
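
To show how the class behaves, here's a small sketch that parses a few made-up robots.txt lines:

using System;
using TauCephei.Crawler;

class RobotCommandDemo
{
    static void Main()
    {
        string[] robotsTxt =
        {
            "# example robots.txt",
            "User-agent: *",
            "Disallow: /search",
            "Sitemap: http://example.com/sitemap.xml"
        };

        foreach (string line in robotsTxt)
        {
            RobotCommand cmd = new RobotCommand(line);

            if (cmd.Command == "COMMENT")
                continue;                       // skip commented-out lines

            if (cmd.Command == "user-agent")
                Console.WriteLine("Agent: " + cmd.UserAgent);
            else
                Console.WriteLine(cmd.Command + " -> " + cmd.Url);
        }
    }
}

A crawler would typically collect the Disallow rules that follow its own user-agent (or *) and check each candidate URL against them before fetching.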

Onward to Spider.cs —->

3 thoughts on “A C# web crawler with page details”

  1. Nice solution, but after publishing it on a server we ran into an issue: after a certain number of calls, Google presents a network traffic CAPTCHA window, where you have to enter the characters shown in an image before you can view your results.
    Is there a way we can bypass that CAPTCHA and view the results directly?

    Please provide me with a solution for this problem.
    Raj Mouli (mouli.raji@gmail.com)

    • Hi Raj,

      Unfortunately, there’s no direct or simple way to do that, as CAPTCHAs are specifically designed to make sure you’re not a crawler or bot ;)

      There are ways around that, but they involve optical character recognition and a whole heap of programming that goes beyond this basic crawler. Maybe I’ll do a post about it in the future, but in the meantime, you can look into “CAPTCHA auto fill” or something similar.
