How to crawl websites with Node.js?

Updated:

July 5, 2024

How to crawl websites with Node.js?

Updated:

July 5, 2024

In this article we will explain how to crawl websites with Node.js. We will use an example of how such a script could help extract information from a travel business website listing hotels in Amsterdam. The script uses a method called web scraping to explore the website in a systematic way, starting from the main page and gradually moving to other pages. It is designed to gather data by following links and collecting specific details from each hotel page it encounters.

Firstly, let’s set up a Node.js project. To do so, you need to run the following command:


mkdir nodejs-scraper

Now you've created an empty directory for your nodejs-scraper project. You can name this project folder whatever you'd like.
To enter the folder you can use the following command:


cd nodejs-scraper

For the next step you will need to use npm (Node Package Manager) - a tool that comes bundled with the Node.js runtime environment. It is a package manager that allows developers to easily install, manage, and share reusable code packages, also known as modules or libraries.
Initialize an npm project with the following command:


npm init -y

This command will set up a new npm project for you. To quickly set up a default project with npm, you'll need to include the -y flag. This flag tells npm to skip the interactive setup process and automatically initialize the project with default settings. If you don't include the -y flag, the terminal will prompt you with some questions that you need to answer.
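After completing these steps, you should have a package.json file in the nodejs-scraper directory. Next, install the packages the script depends on by running `npm install request cheerio async-parallel-queue keepalive-proxy-agent`. Now you can create a JS file (we named it app_parallel.js) and copy this example code to crawl our chosen website: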


const request = require('request');
const url = require('url');
const util = require('util');
const cheerio = require('cheerio');
const Queue = require('async-parallel-queue');
let keepaliveAgent = require('keepalive-proxy-agent')

// Keep-alive agent that sends every request through the proxy listening on
// localhost:8080.
const agent = new keepaliveAgent({ proxy: { host: "localhost", port: 8080 } });

// Options handed to every `request` call: the proxy agent plus a desktop
// browser User-Agent header.
// The User-Agent value must stay on ONE line: a quoted JavaScript string
// literal cannot contain a raw line break.
const opts = {
    agent,
    headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
    },
};

// Promise-returning version of the callback-style `request` function, built
// with util.promisify so it can be awaited. The resolved value is the
// response object whose `statusCode` and `body` are read by collectUrls.
const requestPromise = util.promisify(request);

/* private */
// The two sets are null-prototype objects so that membership tests with `in`
// only ever see keys the crawler stored itself. With a plain `{}`, a host or
// URL such as "constructor" would look like it was already present.
const visited = Object.create(null); // set: URL -> job that was queued for it
let jobs;                            // queue: created by crawl()
const hosts = Object.create(null);   // set: host -> 1 for every registered host

/**
 * Counts the own enumerable string keys of an object.
 *
 * @param {Object} o - Object used as a set or dictionary.
 * @returns {number} Number of keys reported by Object.keys.
 */
function numberOfKeys(o) {
    const ownKeys = Object.keys(o);
    return ownKeys.length;
}
/**
 * Creates a promise that settles after a timer fires.
 *
 * @param {number} time - Timeout passed to setTimeout, in milliseconds.
 * @returns {Promise<undefined>} Promise resolved (with no value) by the timer.
 */
function delay(time) {
    const schedule = (done) => {
        setTimeout(done, time);
    };
    return new Promise(schedule);
}

/**
 * Calls `fn` until it succeeds, making at most `n` attempts and pausing
 * between consecutive attempts.
 *
 * Every failed attempt is logged with its index and error message; a message
 * equal to 'HTTP/1.1 503' is annotated with a proxy-authentication hint.
 *
 * @param {function(): Promise<*>} fn - Operation to attempt.
 * @param {number} n - Maximum number of attempts.
 * @param {number} [delayMs=3000] - Pause between two attempts, in milliseconds.
 * @returns {Promise<*>} The value produced by the first successful attempt.
 * @throws {Error} `Failed retrying ${n} times` once every attempt has failed;
 *   its `cause` property holds the error of the last attempt.
 */
async function retry(fn, n, delayMs = 3000) {
    let lastError;
    for (let i = 0; i < n; i++) {
        try {
            return await fn();
        } catch (e) {
            lastError = e;
            console.log('retry>', i, e.message, e.message === 'HTTP/1.1 503' ? '(Wrong Proxy Authentication)' : '');
        }
        // Only wait when another attempt follows; pausing after the final
        // failure would just postpone the error.
        if (i < n - 1) {
            await delay(delayMs);
        }
    }
    const failure = new Error(`Failed retrying ${n} times`);
    // Assigned as a plain property so it also works on runtimes without the
    // `new Error(message, { cause })` option.
    failure.cause = lastError;
    throw failure;
}

/**
 * Tells the caller whether the given link has to be skipped because of the
 * host limit.
 *
 * A link whose host is already registered in `hosts` is always allowed. A link
 * with a new host is allowed, and its host registered, only while fewer than
 * `maxSites` hosts are registered.
 *
 * @param {string} e - URL of the link to check.
 * @returns {boolean} true when the link must be skipped, false otherwise.
 */
function maxSitesConstraint(e) {
    const { host } = url.parse(e);
    if (host in hosts) {
        return false;
    }
    if (numberOfKeys(hosts) < maxSites) {
        hosts[host] = 1;
        return false;
    }
    return true;
}

/**
 * Logs the data found in every card of a loaded page.
 *
 * For each element matching the card selector, the text of every element
 * matching each field selector is written with console.log, prefixed by the
 * field's tag ("label>" or "price>").
 *
 * @param {Function} $ - Query function for the loaded document (the value
 *   returned by cheerio.load in collectUrls).
 */
function scrapData($) {
    // [log prefix, selector evaluated inside one card], in output order.
    const fields = [
        ["label>", "div.uitk-card-content-section > div > div > h4.uitk-heading"],
        ["price>", "span > div.uitk-text.uitk-type-600"],
        ["price>", "div.uitk-price-lockup > section > span.uitk-lockup-price"],
    ];

    $("div.uitk-card.uitk-card-roundcorner-all").each((_cardIndex, card) => {
        const $card = $(card);
        for (const [tag, selector] of fields) {
            $card.find(selector).each((_nodeIndex, node) => {
                console.log(tag, $(node).text());
            });
        }
    });
}

/**
 * Downloads one page, logs its data through scrapData and returns the links
 * found on it.
 *
 * The request is attempted up to 5 times via retry. Every `<a href>` is
 * resolved against the page URL, so relative links are kept as well; hrefs
 * that cannot be parsed or that are not http(s) (mailto:, javascript:, ...)
 * are ignored, and fragments are dropped so "#section" links do not produce
 * duplicates of the same page.
 *
 * @param {string} uri - Absolute URL of the page to download.
 * @returns {Promise<Object<string, number>>} Set-like object whose keys are
 *   the absolute URLs found on the page; empty when the response status is
 *   not 200.
 * @throws {Error} When the request still fails after all retry attempts.
 */
async function collectUrls(uri) {
    const newLinks = {};

    const response = await retry(() => requestPromise(uri, opts), 5);
    if (response.statusCode !== 200) {
        console.log("Error occurred while fetching data, status:", response.statusCode);
        return newLinks;
    }

    const $ = cheerio.load(response.body);
    scrapData($);

    $("a").each((index, value) => {
        const href = $(value).attr("href");
        if (href === undefined) {
            return;
        }

        let target;
        try {
            target = new URL(href, uri);
        } catch (e) {
            // A malformed href must not abort link collection for the page.
            return;
        }
        if (target.protocol !== "http:" && target.protocol !== "https:") {
            return;
        }

        target.hash = "";
        newLinks[target.href] = 1;
    });
    return newLinks;
}

/**
 * Processes one crawl job: fetches the page and queues a job for every new
 * link that may still be crawled.
 *
 * Errors raised while fetching or queueing are logged and swallowed so that a
 * single failing page does not stop the rest of the crawl.
 *
 * @param {{url: string, depth: number}} job - Page to visit and its distance
 *   from the root page.
 * @returns {Promise<void>}
 */
async function taskHandler(job) {
    console.log("visit> %s %s", job.depth, job.url);

    try {
        const list = await collectUrls(job.url);

        // Check the depth BEFORE consulting maxSitesConstraint: that function
        // registers new hosts as a side effect, and links that are too deep to
        // be crawled must not use up the maxSites budget.
        const nextDepth = job.depth + 1;
        if (nextDepth > maxDepth) {
            return;
        }

        for (const link of Object.keys(list)) {
            if (link in visited) {
                continue;
            }
            if (maxSitesConstraint(link)) {
                continue;
            }
            const newJob = { url: link, depth: nextDepth };
            visited[link] = newJob;
            // Not awaited here: crawl() waits for the whole queue with waitIdle().
            jobs.add(async () => taskHandler(newJob));
        }
    } catch (e) {
        console.log('Error >', e.message);
    }
}


/**
 * Crawl a given site using breadth-first search algorithm.
 *
 * Creates the job queue (10 concurrent jobs), queues the root page at depth 0
 * and waits until the queue is idle.
 *
 * @param {string} root - URL of the page the crawl starts from.
 * @returns {Promise<void>} Settles once the queue reports it is idle.
 */
async function crawl(root) {
    jobs = new Queue({ concurrency: 10 });

    const rootJob = { url: root, depth: 0 };
    // Mark the root as visited so a link pointing back to it is not crawled again.
    visited[root] = rootJob;
    // Count the root's host toward maxSites. The return value is ignored on
    // purpose: the root page is always crawled.
    maxSitesConstraint(root);

    jobs.add(async () => taskHandler(rootJob));
    await jobs.waitIdle();
}

// Deepest link level that gets crawled. taskHandler only queues a link when
// job.depth + 1 <= maxDepth and the root job has depth 0, so with 0 only the
// root page is fetched.
const maxDepth = 0;
// Maximum number of distinct hosts: maxSitesConstraint stops registering new
// hosts once `hosts` holds this many entries.
const maxSites = 1;

/**
 * Entry point: crawls the hotel search results page.
 *
 * Change `startUrl` to crawl a different page.
 *
 * @returns {Promise<void>} Settles when the crawl has finished.
 */
async function main() {
    // The URL is assembled from single-line pieces because a quoted string
    // literal cannot span several lines. The variable is not called `url` so
    // it does not shadow the `url` module required at the top of the file.
    const startUrl = [
        'https://www.expedia.com/Hotel-Search?=one-key-onboarding-dialog',
        '&adults=2&children=',
        '&destination=Amsterdam%20City%20Centre%2C%20Amsterdam%2C%20North%20Holland%2C%20Netherlands',
        '&endDate=2023-05-19',
        '&latLong=52.371613%2C4.899732',
        '&mapBounds=&pwaDialog=',
        '&regionId=6200211',
        '&rooms=1&semdtl=&sort=RECOMMENDED',
        '&startDate=2023-05-18',
        '&theme=&useRewards=false&userIntent=',
    ].join('');
    await crawl(startUrl);
}

// Run the crawler. main() resolves with no value, so there is nothing to print
// on success; a failure is reported on stderr and reflected in the exit code.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

You can change the URL in the async main function and change the ‘scrapData($)’ function if you’d like to extract different data. This function extracts the specified data by selecting elements using CSS selectors.
Don't forget that before you can run the code, you will need to install the Go Simple Tunnel from https://github.com/go-gost/gost/releases. You can also do that by running ‘brew install gost’ command.
After this, you can run a proxy tunnel using the following command:


gost -L=:8080 -F=https://customer-username-country-us:PASSWORD@PROXY_HOST:10001

After all these steps you are ready to run the script and receive back the price and name of the hotel.
To do that, use the following command:


node app_parallel.js (or any name you've chosen for the file)

Your output should look like this:


label> Luxury Suites Amsterdam
price> $560
label> DoubleTree by Hilton Hotel Amsterdam Centraal Station
price> $344
label> Inntel Hotels Amsterdam Landmark
price> $289
label> Kimpton De Witt Amsterdam, an IHG Hotel
price> $414
label> Radisson Blu Hotel, Amsterdam City Center
price> $445
label> W Amsterdam
price> $492

Congratulations! You have successfully learned how to crawl websites using Node.js.

‍

Guoda Sulcaite

Turn data insights into growth with GoProxies

Millions of IPs are just a click away!

Try now!



What’s a Rich Text element?

The rich text element allows you to create and format headings, paragraphs, blockquotes, images, and video all in one place instead of having to add and format them individually. Just double-click and easily create content.

Static and dynamic content editing

A rich text element can be used with static or dynamic content. For static content, just drop it into any page and begin editing. For dynamic content, add a rich text field to any collection and then connect a rich text element to that field in the settings panel. Voila!

How to customize formatting for each rich text

Headings, paragraphs, blockquotes, figures, images, and figure captions can all be styled after a class is added to the rich text element using the "When inside of" nested selector system.

FAQ

What Are Rotating Residential Proxies?

Rotating Residential Proxies offer you the best solution for scaling your scraping without getting blocked.

Rotating proxies provide a different IP each time you make a request. With this automated rotation of IPs, you get unlimited scraping without any detection. It provides an extra layer of anonymity and security for higher-demand web scraping needs.

IP addresses change automatically, so after the initial setup you’re ready to scrape for as long and as much as you need. IPs may shift after a few hours, a few minutes or after each session depending on your configuration. We do this by pulling legitimate residential IPs from our pool.

Why Do You Need Rotating Residential Proxies?

There are a number of use cases for rotating residential proxies. One of the most common ones is bypassing access limitations.

Some websites have specific measures in place to block IP access after a certain number of requests over an extended period of time.

This limits your activity and hinders scalability. With rotating residential IP addresses, it's almost impossible for websites to detect that you are the same user, so you can continue scraping with ease.

When to Use Static Residential Proxies Instead?

There are particular cases where static residential proxies may be more useful for your needs, such as accessing services that require logins.

Rotating IPs might lead to sites not functioning well if they are more optimised for regular use from a single IP.

Learn if our static residential proxies are a better fit for your needs.

Can I choose the IP location by city?

Yes. GoProxies has IPs spread across almost every country and city worldwide.

Can I choose the IP location by country or state?

Yes. GoProxies has IPs spread across X countries with localised IPs in every state.

How do you crawl a website?

To crawl a website, you use a web crawler or spider, which is a program that systematically navigates through web pages, follows links, and collects data from the site. You typically start from a specific URL, then recursively visit linked pages, extracting information as you go.

What is js crawling?

JS crawling, or JavaScript crawling, involves the process of crawling and scraping websites that heavily rely on JavaScript to load and display content. Unlike traditional HTML-based crawling, JS crawling requires tools or techniques that can execute JavaScript code to access and extract data from web pages that use dynamic rendering.

How long does it take to crawl a website?

The time it takes to crawl a website varies widely, depending on factors like the website's size, complexity, and your crawling speed. Small sites may take minutes, while large ones with millions of pages could take hours or even days.

How to crawl websites with Node.js?

How to crawl websites with Node.js?

What’s a Rich Text element?

Static and dynamic content editing

How to customize formatting for each rich text

Guide to Web Scraping Hotel Prices

Facebook Multiple Accounts: How to Manage Them

How to Hide Your IP Address? Proxies and Other Ways

FAQ

How do you crawl a website?

What is js crawling?

How long does it take to crawl a website?