Question

dockerized and Kubernetes deployed puppeteer web scraper very slow

Connected Tutorial
This question is a follow-up to this tutorial:

I have built a puppeteer web scraper which takes a query, scrapes the result and saves it in a json. I want to create a process/pipeline to do this for >50k queries and save the results - ideally not having to wait 2 weeks for the script to finish.

I did a lot of research and decided to follow your tutorial: https://www.digitalocean.com/community/tutorials/how-to-build-a-concurrent-web-scraper-with-puppeteer-node-js-docker-and-kubernetes to build a concurrent scraper by dockerizing and deploying it to a Kubernetes cluster. It’s a great resource and so and with a lot of work adapted everything to my specific situation.

However, when I tried running it, I kept getting the error connect eaddrnotavail but I couldn’t really understand how best to solve the issue. I finally got it to work by changing my client side code slightly compared to the article by using async/await. It works but it is very slow - I feel like only one request is sent to the cluster and finished and then the next one is sent off, which isn’t really concurrent and I don’t need a cluster for that!

So now I’m asking myself what is wrong or how can I improve my code to speed it up. The way I see it it can be done a lot faster but I must be missing something compared to your code which works so fast. My client side code:

let axios = require('axios')
let ldb = require('./lowdbHelper.js').LowDbHelper
let ldbHelper = new ldb()
let allAddresses = ldbHelper.getData()
const fs = require('fs');
const csv = require('csv-parser');


let fname = 'Altbau_new.csv';
let server = "http://123.456.789.000"
let podsWorkDone = []
let addressesDetails = []
let errors = []

function main() {

  getDetails()

}

async function getDetails(){
  let begin = Date.now()
  for (let j = 0; j < allAddresses.length; j++) {
    try{
  let data = {
  url: 'https://mein.wien.gv.at/Meine-Amtswege/richtwert?subpage=/lagezuschlag/',
  addr: allAddresses[j],
  commands: [{description: 'scrape', type: 'scrape'}]
  }

  await sendRequest(data, function (result) {
    parseResult(result, begin)
  })
  } catch (e){
    continue
  }
  }
}

async function sendRequest(payload, cb) {
  let address = payload
  try {
    await axios.post(`${server}/api/addresses`, address).then(response => {
      if (Object.keys(response.data).includes('error')) {
        let res = {
          address: address.addr,
          error: response.data.error
        }
        cb(res)
      } else {
        cb(response.data)
      }
    })
  } catch (error) {
    console.log(error)
    let res = {
      address: address.addr,
      error: error
    }
    cb({ res })
  }
}

function parseResult(result, begin){
  try {
    let end = Date.now()
    let timeSpent = (end - begin) / 1000 + "secs ";
    if (!Object.keys(result).includes("error")) {
      let wasSuccessful = Object.keys(result.zuschlag).length > 0 ? true : false
      if (wasSuccessful) {
        let podID = result.hostname
        let podsIDs = podsWorkDone.length > 0 ? podsWorkDone.map(pod => { return Object.keys(pod)[0]}) : []
        if (!podsIDs.includes(podID)) {
          let podWork = {}
          podWork[podID] = 1
          podsWorkDone.push(podWork)
        } else {
          for (let pwd = 0; pwd < podsWorkDone.length; pwd++) {
            if (Object.keys(podsWorkDone[pwd]).includes(podID)) {
              podsWorkDone[pwd][podID] += 1
              break
            }
          }
        }
        addressesDetails.push(result)
      } else {
        errors.push(result)
      }
    } else {
      errors.push(result)
    }
    console.log('podsWorkDone', podsWorkDone, ', retrieved ' + addressesDetails.length + " addresses, ",
      "took " + timeSpent + ", ", "used " + podsWorkDone.length + " pods,", " errors: " + errors.length)
    ldbHelper.saveData(addressesDetails)
  } catch (error) {
    console.log(error)
  }
}

main()

I included await at the sendRequest function and made the getDetails function async otherwise it would not work. My server side:

const express = require('express');
const bodyParser = require('body-parser')
const os = require('os');

const PORT = 5000;
const app = express();
let timeout = 1500000

app.use(bodyParser.urlencoded({ extended: true }))
app.use(bodyParser.json())

let browsers = 0
let maxNumberOfBrowsers = 5

app.get('/', (req, res) => {
  console.log(os.hostname())
  let response = {
    msg: 'hello world',
    hostname: os.hostname().toString()
  }
  res.send(response);
});

app.post('/api/addresses', async (req, res) => {
  req.setTimeout(timeout);
  try {
    let data = req.body
    console.log(req.body.url)
    while (browsers == maxNumberOfBrowsers) {
      await sleep(1000)
    }
    await getAddressesHandler(data).then(result => {
      let response = {
        msg: 'retrieved addresses ',
        hostname: os.hostname(),
        addr: data.addr,
        zuschlag: result

      }
      console.log('done')
      res.send(response)
    })
  } catch (error) {
    res.send({ error: error.toString() })
  }
});

async function getAddressesHandler(arg) {
  let pMng = require('./puppeteerManager')
  let puppeteerMng = new pMng.PuppeteerManager(arg)
  browsers += 1
  try {
    let addresses = await puppeteerMng.getAllAddresses().then(result => {
      return result
    })
    browsers -= 1
    return addresses
  } catch (error) {
    browsers -= 1
    console.log(error)
  }
}

function sleep(ms) {
    console.log(' running maximum number of browsers')
    return new Promise(resolve => setTimeout(resolve, ms))
  }

  app.listen(PORT);
console.log(`Running on port: ${PORT}`);

many thanks in advance


Submit an answer


This textbox defaults to using Markdown to format your answer.

You can type !ref in this text area to quickly search our full set of tutorials, documentation & marketplace offerings and insert the link!

Sign In or Sign Up to Answer

These answers are provided by our Community. If you find them useful, show some love by clicking the heart. If you run into issues leave a comment, or add your own answer to help others.

Accepted Answer

Using await before the sendRequest() function is a big no because when you use await the code will be forced to wait for the sendRequest() function to finishing executing. You mentioned that you have more than 50K queries, so maybe the client can’t send that many requests at once. So my advice would be to remove the await in the sendRequest() function and only send 400 requests at a time to the server, if that succeeds try increasing the number of requests that the client makes to see what is the maximum number of requests that the client can make. Also, check this question I found on StackOverflow , it might help you with the EADDRNOTAVAIL issue. You can reach me at sam.b.russian@gmail.com if you need more help.

Try DigitalOcean for free

Click below to sign up and get $200 of credit to try our products over 60 days!

Sign up

Get our biweekly newsletter

Sign up for Infrastructure as a Newsletter.

Hollie's Hub for Good

Working on improving health and education, reducing inequality, and spurring economic growth? We'd like to help.

Become a contributor

Get paid to write technical tutorials and select a tech-focused charity to receive a matching donation.

Welcome to the developer cloud

DigitalOcean makes it simple to launch in the cloud and scale up as you grow — whether you're running one virtual machine or ten thousand.

Learn more
DigitalOcean Cloud Control Panel