
How to crawl and scrape websites using C#?

In this article we will review how to write a web crawler that scrapes web pages using the C# language. A web crawler visits websites, finds links to other pages, and then looks for specific information on those pages. In this example, the crawler collects information about hotels, such as their names and prices, from a travel website.

To start, you will need Visual Studio Community Edition 2019 or newer with the C# workload installed. Once you have Visual Studio installed, open it and click “Create a new project”. When prompted to choose a template, pick the “Console App (.NET Core)” template. Provide a name for your project and choose the location where you want to save it. Once the project is created, open it and select the “Program.cs” file.
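If you prefer working from a terminal instead, the same console project can be created with the .NET CLI (assuming the .NET SDK is installed; the project name here is just an example):

dotnet new console -n Crawler

Either way, you end up with a console project containing a Program.cs file.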

Here’s the full code of the web crawler in C#:

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;
using HtmlAgilityPack;
namespace Crawler
{
   class Crawler
   {
       private IDictionary<string, byte> visited = new ConcurrentDictionary<string, byte>();
       private IDictionary<string, byte> hosts = new ConcurrentDictionary<string, byte>();
       private ConcurrentQueue<Link> queue;
       private HttpClient client;
       // settings
       private int maxDepth = 0;
       private int maxSites = 1;
       // maxSitesConstraint returns true if we have to skip the given link
       private bool maxSitesConstraint(string e)
       {
           var uri = new Uri(e);
           if (!hosts.ContainsKey(uri.Host))
           {
               if (hosts.Count() < maxSites)
               {
                   hosts[uri.Host] = 0;
               }
               else
               {
                   return true;
               }
           }
           return false;
       }
        private void scrapData(HtmlDocument doc)
        {
            // Each hotel result on the page is rendered as a "card" element.
            var cards = doc.DocumentNode.SelectNodes(
                "//div[contains(@class, 'uitk-card uitk-card-roundcorner-all')]");
            if (cards == null) return;
            foreach (HtmlNode node in cards)
            {
                // Hotel name
                var n1 = node.SelectNodes(
                    ".//div[contains(@class, 'uitk-card-content-section')]/div/div/h4[contains(@class, 'uitk-heading')]");
                if (n1 != null) foreach (var nn1 in n1)
                    {
                        Console.WriteLine(string.Format("label> {0}", nn1.InnerText));
                    }
                // Price (the page markup uses two variants for prices)
                var n2 = node.SelectNodes(
                    ".//span/div[contains(@class, 'uitk-text uitk-type-600')]");
                if (n2 != null) foreach (var nn in n2)
                    {
                        Console.WriteLine(string.Format("price> {0}", nn.InnerText));
                    }
                var n3 = node.SelectNodes(
                    ".//div[contains(@class, 'uitk-price-lockup')]/section/span[contains(@class, 'uitk-lockup-price')]");
                if (n3 != null) foreach (var nn in n3)
                    {
                        Console.WriteLine(string.Format("price> {0}", nn.InnerText));
                    }
            }
        }
       private async Task<ISet<string>> collectLinks(string link)
       {
           var newLinks = new HashSet<string>();
           var s = await client.GetStringAsync(link);
           var doc = new HtmlDocument();
           doc.LoadHtml(s);
           scrapData(doc);
           var nodes = doc.DocumentNode.SelectNodes("//a[@href]");
           if (nodes != null)
           {
               foreach (HtmlNode node in nodes)
               {
                   var v = node.GetAttributeValue("href", "");
                    try
                    {
                        // new Uri() throws for relative or malformed URLs,
                        // so only absolute links are collected.
                        var u = new Uri(v);
                        newLinks.Add(v);
                    }
                    catch { }
               }
           }
           return newLinks;
       }
       // Crawl a given site using breadth-first search algorithm
       private async Task TaskHandler(Link j)
       {
           Console.WriteLine(string.Format("visit> {1} {0}", j.uri, j.depth));
           var list = await collectLinks(j.uri);
           foreach (var e in list)
           {
               if (!visited.ContainsKey(e))
               {
                   if (maxSitesConstraint(e))
                   {
                       continue;
                   }
                   if (j.depth + 1 <= maxDepth)
                   {
                       var newJob = new Link(e, j.depth + 1);
                       visited[e] = 0;
                       queue.Enqueue(newJob);
                   }
               }
           }
       }
       // _maxDepth - maximum depth of walk tree
        // _maxSites - maximum number of sites to crawl, including an initial address
       public async Task Start(string u, int _maxDepth, int _maxSites)
       {
           maxDepth = _maxDepth;
           maxSites = _maxSites;
           var httpClientHandler = new HttpClientHandler
           {
               Proxy = new WebProxy
               {
                   Address = new Uri("http://localhost:8080"),
                   BypassProxyOnLocal = false,
                   UseDefaultCredentials = false,
               }
           };
           client = new HttpClient(handler: httpClientHandler, disposeHandler: true);
           var maxThreads = 8;
           queue = new ConcurrentQueue<Link>();
           queue.Enqueue(new Link(u, 0));
           var tasks = new List<Task>();
           for (int n = 0; n < maxThreads; n++)
           {
               tasks.Add(Task.Run(async () =>
               {
                   while (queue.TryDequeue(out Link l))
                   {
                       await RetryHelper.RetryOnExceptionAsync(5, TimeSpan.FromSeconds(5), async () => {
                           await TaskHandler(l);
                       });
                   }
               }));
           }
           await Task.WhenAll(tasks);
       }
   }
   class Program
   {
       static void Main(string[] args)
       {
           var c = new Crawler();
           try
           {
               var uri = "https://www.expedia.com/Hotel-Search?adults=2&destination=Tbilisi%2C%20Georgia&rooms=1";
                var maxDepth = 0;  // maximum depth of the walk tree (0 = visit only the start page)
                var maxSites = 1;  // maximum number of sites to crawl, including the initial address
               c.Start(uri, maxDepth, maxSites)
                   .Wait();
           }
           catch (Exception e)
           {
               Console.WriteLine(e.Message);
           }
       }
   }
}

If you’d like to extract different data, you will need to change the URL assigned to ‘var uri’ and adjust the ‘scrapData(HtmlDocument doc)’ function, as it is the part that extracts and prints data from the HTML document using the HtmlAgilityPack library.
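As an illustration, a variant of ‘scrapData’ that collects article headlines instead of hotel cards might look like the sketch below. The selectors here are hypothetical; inspect your target page’s HTML to find the real class names.

private void scrapData(HtmlDocument doc)
{
    // Hypothetical example: extract <h2> headlines from article cards.
    // 'article-card' is a placeholder class name, not a real selector.
    var headlines = doc.DocumentNode.SelectNodes(
        "//div[contains(@class, 'article-card')]//h2");
    if (headlines != null)
        foreach (HtmlNode node in headlines)
        {
            Console.WriteLine(string.Format("headline> {0}", node.InnerText.Trim()));
        }
}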

HtmlAgilityPack is not added by default, so to add it, right-click on your project and select “Manage NuGet Packages…”. In the search box, enter HtmlAgilityPack, pick the package, and install it. Once that is done, right-click on your project’s name one more time, select “Add” -> “New Class”, and add an “Empty Class”. Rename it to “Utils.cs”.
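As an aside, the package can also be added from a terminal with the .NET CLI, run from the project directory; the end result is the same:

dotnet add package HtmlAgilityPack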

Now paste this code into Utils.cs:

using System;
using System.Threading.Tasks;
namespace Crawler
{
   class Link
   {
       public string uri;
       public int depth;
       public Link(string uri, int depth)
       {
           this.uri = uri;
           this.depth = depth;
       }
   }
   public static class RetryHelper
   {
       public static async Task RetryOnExceptionAsync(
           int times, TimeSpan delay, Func<Task> operation)
       {
           if (times <= 0)
               throw new ArgumentOutOfRangeException(nameof(times));
           var attempts = 0;
           do
           {
               try
               {
                   attempts++;
                   await operation();
                   break;
               }
               catch
               {
                   Console.WriteLine
($"Exception on attempt {attempts} of {times}. Will retry after sleeping for {delay}.");
                   if (attempts == times)
                       throw;
                   await Task.Delay(delay);
               }
           } while (true);
       }
   }
}

This code creates two new classes: “Link”, which represents a link in the web crawler (a URL paired with its crawl depth), and “RetryHelper”, which offers a retry mechanism for asynchronous operations.
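For instance, a minimal usage sketch of RetryHelper (hypothetical values: ‘client’ stands for any existing HttpClient, and the URL is a placeholder) retries a flaky request up to 3 times with a 2-second pause between attempts:

// Inside an async method:
await RetryHelper.RetryOnExceptionAsync(3, TimeSpan.FromSeconds(2), async () =>
{
    // Any exception thrown here triggers a retry after the delay.
    var html = await client.GetStringAsync("https://example.com");
    Console.WriteLine(html.Length);
});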

Now create one more file. Again, right-click on your project’s name, select “Add” -> “New File”, choose an empty XML file, and rename it to “Crawler-MultiThreaded.csproj”.

This file will serve as the configuration file for the C# project, defining the properties, references, and source files needed to build and compile it.
Paste this code into the XML file:

<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" DefaultTargets="Build">
  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
 <PropertyGroup>
   <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
   <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
   <ProjectGuid>{38B7DCED-0C40-4FA6-BB32-DC6D0EC28A12}</ProjectGuid>
   <OutputType>Exe</OutputType>
   <RootNamespace>Crawler_MultiThreaded</RootNamespace>
   <AssemblyName>Crawler-MultiThreaded</AssemblyName>
   <TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
   <FileAlignment>512</FileAlignment>
   <AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
   <Deterministic>true</Deterministic>
 </PropertyGroup>
 <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
   <PlatformTarget>AnyCPU</PlatformTarget>
   <DebugSymbols>true</DebugSymbols>
   <DebugType>full</DebugType>
   <Optimize>false</Optimize>
   <OutputPath>bin\Debug\</OutputPath>
   <DefineConstants>DEBUG;TRACE</DefineConstants>
   <ErrorReport>prompt</ErrorReport>
   <WarningLevel>4</WarningLevel>
 </PropertyGroup>
 <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
   <PlatformTarget>AnyCPU</PlatformTarget>
   <DebugType>pdbonly</DebugType>
   <Optimize>true</Optimize>
   <OutputPath>bin\Release\</OutputPath>
   <DefineConstants>TRACE</DefineConstants>
   <ErrorReport>prompt</ErrorReport>
   <WarningLevel>4</WarningLevel>
 </PropertyGroup>
 <ItemGroup>
   <Reference Include="HtmlAgilityPack, Version=1.11.46.0, Culture=neutral,
PublicKeyToken=bd319b19eaf3b43a, processorArchitecture=MSIL">
     <HintPath>..\packages\HtmlAgilityPack.1.11.46\lib\Net45\HtmlAgilityPack.dll</HintPath>
   </Reference>
   <Reference Include="System" />
   <Reference Include="System.Core" />
   <Reference Include="System.Xml.Linq" />
   <Reference Include="System.Data.DataSetExtensions" />
   <Reference Include="Microsoft.CSharp" />
   <Reference Include="System.Data" />
   <Reference Include="System.Net.Http" />
   <Reference Include="System.Xml" />
 </ItemGroup>
 <ItemGroup>
   <Compile Include="Utils.cs" />
   <Compile Include="Program.cs" />
   <Compile Include="Properties\AssemblyInfo.cs" />
 </ItemGroup>
 <ItemGroup>
   <None Include="App.config" />
   <None Include="packages.config" />
 </ItemGroup>
 <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />

Now you can save all files.
Congratulations! You’ve created a simple C# crawler.

Before building and running the code, install Go Simple Tunnel (gost) from https://github.com/go-gost/gost/releases or, if you use Homebrew, by executing ‘brew install gost’.
Next, execute the following command to run a proxy tunnel:

gost -L=:8080 -F=https://customer-username-country-us:password@proxy.goproxies.com:10001
Once the tunnel is active, you can reopen the project and proceed with building and running it.
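If you do not have a proxy subscription, the crawler can also run over a direct connection. A minimal alternative (my own sketch, not part of the original setup) is to replace the proxy-handler block in Start() with a plain HttpClient:

// Alternative to the proxy setup in Start(): connect directly.
client = new HttpClient();
// Optional: cap how long a single request may take.
client.Timeout = TimeSpan.FromSeconds(30);

Keep in mind that many sites rate-limit or block crawlers that connect directly, which is why the article routes traffic through a proxy.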
Either way, when you build and run the project you should see output like:
visit> 0 https://www.expedia.com/Hotel-Search?adults=2&destination=Tbilisi%2C%20Georgia&rooms=1
label> The Biltmore Hotel Tbilisi
price> $153
label> Shota@Rustaveli Boutique hotel
price> $133
label> Rooms Hotel Tbilisi
price> $141
label> Iota Hotel Tbilisi
price> $98
label> Holiday Inn Tbilisi, an IHG Hotel
price> $102
label> Terrace House Tbilisi
price> $25
label> Moxy Tbilisi
price> $63…
