< Back

How to crawl and scrape websites using C#?

Crawl

This write-up delves into the intricacies of developing a web crawler in C# programming language for the purpose of web page scraping. The web crawler effectively navigates through diverse websites, identifies links to additional pages, and subsequently extracts desired data from these pages. Through this particular demonstration, the web crawler is applied to extract hotel details such as names and prices from a travel website.

Firstly, you will require Visual Studio Community Edition 2019 or a more recent version integrated with C#. Upon installation of Visual Studio, launch the application and select the option "Create a new project". When prompted to select a template, opt for the "Console App (.NET Core)" template. Assign a suitable name to your project and designate the desired storage location. After the project is successfully initialized, proceed to open it and access the "Program.cs" file.

Here’s the full code of the web crawler in C#:

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;
using System.Xml;
using HtmlAgilityPack;

namespace Crawler
{
   class Crawler
   {
       private IDictionary<string, byte> visited = new ConcurrentDictionary<string, byte>();
       private IDictionary<string, byte> hosts = new ConcurrentDictionary<string, byte>();
       private ConcurrentQueue<Link> queue;
       private HttpClient client;

       // settings
       private int maxDepth = 0;
       private int maxSites = 1;

       // maxSitesConstraint returns true if we have to skip the given link
       private bool maxSitesConstraint(string e)
       {
           var uri = new Uri(e);
           if (!hosts.ContainsKey(uri.Host))
           {
               if (hosts.Count() < maxSites)
               {
                   hosts[uri.Host] = 0;
               }
               else
               {
                   return true;
               }
           }
           return false;
       }

       private void scrapData(HtmlDocument doc)
       {
           var cards = doc.DocumentNode.SelectNodes(
"//div[contains(@class, 'uitk-card uitk-card-roundcorner-all')]");
           if (cards != null) foreach (HtmlNode node in cards)
               {
                   var n1 = node.SelectNodes(
.//div[contains(@class, 'uitk-card-content-section')]/div/div/h4[contains(@class, 'uitk-heading')]");
                   if (n1 != null) foreach (var nn1 in n1)
                       {
                           Console.WriteLine(string.Format("label> {0}", nn1.InnerText));
                       }

                   var n2 = node.SelectNodes
(".//span/div[contains(@class, 'uitk-text.uitk-type-600')]");
                   if (n2 != null) foreach (var nn in n2)
                       {
                           Console.WriteLine(string.Format("price> {0}", nn.InnerText));
                       }

                   var n3 = node.SelectNodes(".//div[contains(@class, 'uitk-price-lockup')]/section/span[contains(@class, 'uitk-lockup-price')]");
                   if (n3 != null) foreach (var nn in n3)
                       {
                           Console.WriteLine(string.Format("price> {0}", nn.InnerText));
                       }
               }
       }

       private async Task<ISet<string>> collectLinks(string link)
       {
           var newLinks = new HashSet<string>();

           var s = await client.GetStringAsync(link);
           var doc = new HtmlDocument();
           doc.LoadHtml(s);

           scrapData(doc);

           var nodes = doc.DocumentNode.SelectNodes("//a[@href]");
           if (nodes != null)
           {
               foreach (HtmlNode node in nodes)
               {
                   var v = node.GetAttributeValue("href", "");
                   try
                   {
                       var u = new Uri(v);
                       newLinks.Add(v);
                   }
                   catch { }
               }
           }
           return newLinks;
       }

       // Crawl a given site using breadth-first search algorithm
       private async Task TaskHandler(Link j)
       {
           Console.WriteLine(string.Format("visit> {1} {0}", j.uri, j.depth));

           var list = await collectLinks(j.uri);
           foreach (var e in list)
           {
               if (!visited.ContainsKey(e))
               {
                   if (maxSitesConstraint(e))
                   {
                       continue;
                   }
                   if (j.depth + 1 <= maxDepth)
                   {
                       var newJob = new Link(e, j.depth + 1);
                       visited[e] = 0;

                       queue.Enqueue(newJob);
                   }
               }
           }
       }

       // _maxDepth - maximum depth of walk tree
       // _maxSites - maximum number of sites to crawl, including an initail address
       public async Task Start(string u, int _maxDepth, int _maxSites)
       {
           maxDepth = _maxDepth;
           maxSites = _maxSites;

           var httpClientHandler = new HttpClientHandler
           {
               Proxy = new WebProxy
               {
                   Address = new Uri("http://localhost:8080"),
                   BypassProxyOnLocal = false,
                   UseDefaultCredentials = false,
               }
           };
           client = new HttpClient(handler: httpClientHandler, disposeHandler: true);

           var maxThreads = 8;
           queue = new ConcurrentQueue<Link>();
           queue.Enqueue(new Link(u, 0));

           var tasks = new List<Task>();
           for (int n = 0; n < maxThreads; n++)
           {
               tasks.Add(Task.Run(async () =>
               {
                   while (queue.TryDequeue(out Link l))
                   {
                       await RetryHelper.RetryOnExceptionAsync(5, TimeSpan.FromSeconds(5), async () => {
                           await TaskHandler(l);
                       });
                   }
               }));
           }
           await Task.WhenAll(tasks);
       }
   }

   class Program
   {
       static void Main(string[] args)
       {

           var c = new Crawler();

           try
           {
               var uri = "https://www.expedia.com/Hotel-Search?adults=2&destination=Tbilisi%2C%20Georgia&rooms=1";
               var maxDepth = 0;
               var maxSites = 1;
               // 0 - is a maximum depth of walk tree
               // 1 - maximum number of sites to crawl, including an initail address

               c.Start(uri, maxDepth, maxSites)
                   .Wait();

           }
           catch (Exception e)
           {
               Console.WriteLine(e.Message);
           }
       }
   }
}

If you’d like to extract different data, you will need to change the URL in ‘var uri’ and adapt the ‘scrapData(HtmlDocument doc)’ function, as that function is what extracts and prints data from the HTML document using the HtmlAgilityPack library.

HtmlAgilityPack is not added by default, so to do that, you will need to right click on your project and select “Manage NuGet Packages…”. Now in the search enter HtmlAgilityPack, pick one of the packages and install it. Once that is done, right click on your project’s name one more time and select “Add” -> “New Class” and add an “Empty Class”. Rename it to “Utils.cs”

Now paste this code into there:

using System;
using System.Threading.Tasks;

namespace Crawler
{
   class Link
   {
       public string uri;
       public int depth;

       public Link(string uri, int depth)
       {
           this.uri = uri;
           this.depth = depth;
       }
   }

   public static class RetryHelper
   {
       public static async Task RetryOnExceptionAsync(
           int times, TimeSpan delay, Func<Task> operation)
       {
           if (times <= 0)
               throw new ArgumentOutOfRangeException(nameof(times));

           var attempts = 0;
           do
           {
               try
               {
                   attempts++;
                   await operation();
                   break;
               }
               catch
               {
                   Console.WriteLine
($"Exception on attempt {attempts} of {times}. Will retry after sleeping for {delay}.");
                   if (attempts == times)
                       throw;

                   await Task.Delay(delay);
               }
           } while (true);
       }
   }
}

This code creates two new classes - "Link" will represent a link in web crawler and "RetryHelper" that offers a retry mechanism for asynchronous operations.

Now create one more file. Again, right click on your project’s name, select “Add” -> “New file”, choose “Empty XML file”, and rename it to “Crawler-MultiThreaded.csproj”.

This file will serve as a configuration file for the C# project, defining its properties, references, and source code files necessary for building and compiling the project.
Paste this code into XML file:

<!-- Classic (non-SDK) MSBuild project file. The root <Project> element was
     missing from the original listing, which made the file invalid XML. -->
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props"
          Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
  <PropertyGroup>
    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
    <ProjectGuid>{38B7DCED-0C40-4FA6-BB32-DC6D0EC28A12}</ProjectGuid>
    <OutputType>Exe</OutputType>
    <RootNamespace>Crawler_MultiThreaded</RootNamespace>
    <AssemblyName>Crawler-MultiThreaded</AssemblyName>
    <TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
    <FileAlignment>512</FileAlignment>
    <AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
    <Deterministic>true</Deterministic>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
    <PlatformTarget>AnyCPU</PlatformTarget>
    <DebugSymbols>true</DebugSymbols>
    <DebugType>full</DebugType>
    <Optimize>false</Optimize>
    <OutputPath>bin\Debug\</OutputPath>
    <DefineConstants>DEBUG;TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  </PropertyGroup>
  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
    <PlatformTarget>AnyCPU</PlatformTarget>
    <DebugType>pdbonly</DebugType>
    <Optimize>true</Optimize>
    <OutputPath>bin\Release\</OutputPath>
    <DefineConstants>TRACE</DefineConstants>
    <ErrorReport>prompt</ErrorReport>
    <WarningLevel>4</WarningLevel>
  </PropertyGroup>
  <ItemGroup>
    <Reference Include="HtmlAgilityPack, Version=1.11.46.0, Culture=neutral, PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
      <HintPath>..\packages\HtmlAgilityPack.1.11.46\lib\Net45\HtmlAgilityPack.dll</HintPath>
    </Reference>
    <Reference Include="System" />
    <Reference Include="System.Core" />
    <Reference Include="System.Xml.Linq" />
    <Reference Include="System.Data.DataSetExtensions" />
    <Reference Include="Microsoft.CSharp" />
    <Reference Include="System.Data" />
    <Reference Include="System.Net.Http" />
    <Reference Include="System.Xml" />
  </ItemGroup>
  <ItemGroup>
    <Compile Include="Utils.cs" />
    <Compile Include="Program.cs" />
    <Compile Include="Properties\AssemblyInfo.cs" />
  </ItemGroup>
  <ItemGroup>
    <None Include="App.config" />
    <None Include="packages.config" />
  </ItemGroup>
  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>

Now you can save all files.
Congratulations! You’ve created a C# simple crawler.

Before building and running the code, remember to install the Go Simple Tunnel from https://github.com/go-gost/gost/releases or by executing the command 'brew install gost'.
Next, execute the following command to run a proxy tunnel:

gost -L=:8080
-F=https://customer-username-country-us:password@proxy.goproxies.com:10001

Once the tunnel is active, you can reopen the project and proceed with building and running it.
You should see the output like:

visit> 0 https://www.expedia.com/Hotel-Search?adults=2&destination=Tbilisi%2C%20Georgia&rooms=1
label> The Biltmore Hotel Tbilisi
price> $153
label> Shota@Rustaveli Boutique hotel
price> $133
label> Rooms Hotel Tbilisi
price> $141
label> Iota Hotel Tbilisi
price> $98
label> Holiday Inn Tbilisi, an IHG Hotel
price> $102
label> Terrace House Tbilisi
price> $25
label> Moxy Tbilisi
price> $63…

Copywriter

Matas has a strong background in information technology and services, as well as computer and network security. Matas’ areas of expertise include cybersecurity and related fields, growth, digital, performance, and content marketing, along with hands-on experience in both the B2B and B2C markets.

FAQ

What Are Rotating Residential Proxies?
Rotating Residential Proxies offer you the best solution for scaling your scraping without getting blocked.

Rotating proxies provide a different IP each time you make a request. With this automated rotation of IPs, you get unlimited scraping without any detection. It provides an extra layer of anonymity and security for higher-demand web scraping needs.

IP addresses change automatically, so after the initial set up you’re ready to scrape as long and much as you need. IPs may shift after a few hours, a few minutes or after each session depending on your configuration. We do this by pulling legitimate residential IPs from our pool.
Why Do You Need Rotating Residential Proxies?
There are a number of use cases for rotating residential proxies. One of the most common ones is bypassing access limitations.

Some websites have specific measures in place to block IP access after a certain number of requests over an extended period of time.

This limits your activity and hinders scalability. With rotating residential IP addresses, it's almost impossible for websites to detect that you are the same user, so you can continue scraping with ease.
When to Use Static Residential Proxies Instead?
There are particular cases where static residential proxies may be more useful for your needs, such as accessing services that require logins.

Rotating IPs might lead to sites not functioning well if they are more optimised for regular use from a single IP.

Learn if our static residential proxies are a better fit for your needs.
Can I choose the IP location by city?
Yes. GoProxies has IPs spread across almost every country and city worldwide.
Can I choose the IP location by country state?
Yes. GoProxies has IPs spread across numerous countries with localised IPs in every state.

Is C# good for web scraping?

Yes, C# is a good choice for web scraping. It has libraries like HtmlAgilityPack and HttpClient that make it suitable for scraping HTML content from websites.

Is web scraping the same as crawling?

No, web scraping and web crawling are not the same. Web crawling involves systematically navigating through websites and collecting data, often used by search engines to index web content. Web scraping, on the other hand, is focused on extracting specific information from web pages, like product prices or news headlines. Crawling is a broader process that can include scraping as one of its components.

Is Python or C# better for web scraping?

Python is commonly chosen for web scraping because of its extensive range of libraries such as Beautiful Soup and Scrapy, which are specifically developed for this purpose. Although C# can also be utilized for web scraping, Python's user-friendly nature and wide array of tools for scraping make it a more favored option in this particular context.

What’s a Rich Text element?

The rich text element allows you to create and format headings, paragraphs, blockquotes, images, and video all in one place instead of having to add and format them individually. Just double-click and easily create content.

Static and dynamic content editing

A rich text element can be used with static or dynamic content. For static content, just drop it into any page and begin editing. For dynamic content, add a rich text field to any collection and then connect a rich text element to that field in the settings panel. Voila!

How to customize formatting for each rich text

Headings, paragraphs, blockquotes, figures, images, and figure captions can all be styled after a class is added to the rich text element using the "When inside of" nested selector system.

By clicking “Accept All Cookies”, you agree to the storing of cookies on your device to enhance site navigation, analyze site usage, and assist in our marketing efforts. View our Privacy Policy for more information.