Question
dockerized and Kubernetes deployed puppeteer web scraper very slow
I have built a puppeteer web scraper which takes a query, scrapes the result and saves it in a json. I want to create a process/pipeline to do this for >50k queries and save the results - ideally not having to wait 2 weeks for the script to finish.
I did a lot of research and decided to follow your tutorial: https://www.digitalocean.com/community/tutorials/how-to-build-a-concurrent-web-scraper-with-puppeteer-node-js-docker-and-kubernetes to build a concurrent scraper by dockerizing and deploying it to a Kubernetes cluster. It’s a great resource and so and with a lot of work adapted everything to my specific situation.
However, when I tried running it, I kept getting the error connect eaddrnotavail but I couldn’t really understand how best to solve the issue. I finally got it to work by changing my client side code slightly compared to the article by using async/await. It works but it is very slow - I feel like only one request is sent to the cluster and finished and then the next one is sent off, which isn’t really concurrent and I don’t need a cluster for that!
So now I’m asking myself what is wrong or how can I improve my code to speed it up. The way I see it it can be done a lot faster but I must be missing something compared to your code which works so fast. My client side code:
let axios = require('axios')
let ldb = require('./lowdbHelper.js').LowDbHelper
let ldbHelper = new ldb()
let allAddresses = ldbHelper.getData()
const fs = require('fs');
const csv = require('csv-parser');
let fname = 'Altbau_new.csv';
let server = "http://123.456.789.000"
let podsWorkDone = []
let addressesDetails = []
let errors = []
function main() {
getDetails()
}
async function getDetails(){
let begin = Date.now()
for (let j = 0; j < allAddresses.length; j++) {
try{
let data = {
url: 'https://mein.wien.gv.at/Meine-Amtswege/richtwert?subpage=/lagezuschlag/',
addr: allAddresses[j],
commands: [{description: 'scrape', type: 'scrape'}]
}
await sendRequest(data, function (result) {
parseResult(result, begin)
})
} catch (e){
continue
}
}
}
async function sendRequest(payload, cb) {
let address = payload
try {
await axios.post(`${server}/api/addresses`, address).then(response => {
if (Object.keys(response.data).includes('error')) {
let res = {
address: address.addr,
error: response.data.error
}
cb(res)
} else {
cb(response.data)
}
})
} catch (error) {
console.log(error)
let res = {
address: address.addr,
error: error
}
cb({ res })
}
}
function parseResult(result, begin){
try {
let end = Date.now()
let timeSpent = (end - begin) / 1000 + "secs ";
if (!Object.keys(result).includes("error")) {
let wasSuccessful = Object.keys(result.zuschlag).length > 0 ? true : false
if (wasSuccessful) {
let podID = result.hostname
let podsIDs = podsWorkDone.length > 0 ? podsWorkDone.map(pod => { return Object.keys(pod)[0]}) : []
if (!podsIDs.includes(podID)) {
let podWork = {}
podWork[podID] = 1
podsWorkDone.push(podWork)
} else {
for (let pwd = 0; pwd < podsWorkDone.length; pwd++) {
if (Object.keys(podsWorkDone[pwd]).includes(podID)) {
podsWorkDone[pwd][podID] += 1
break
}
}
}
addressesDetails.push(result)
} else {
errors.push(result)
}
} else {
errors.push(result)
}
console.log('podsWorkDone', podsWorkDone, ', retrieved ' + addressesDetails.length + " addresses, ",
"took " + timeSpent + ", ", "used " + podsWorkDone.length + " pods,", " errors: " + errors.length)
ldbHelper.saveData(addressesDetails)
} catch (error) {
console.log(error)
}
}
main()
I included await at the sendRequest function and made the getDetails function async otherwise it would not work. My server side:
const express = require('express');
const bodyParser = require('body-parser')
const os = require('os');
const PORT = 5000;
const app = express();
let timeout = 1500000
app.use(bodyParser.urlencoded({ extended: true }))
app.use(bodyParser.json())
let browsers = 0
let maxNumberOfBrowsers = 5
app.get('/', (req, res) => {
console.log(os.hostname())
let response = {
msg: 'hello world',
hostname: os.hostname().toString()
}
res.send(response);
});
app.post('/api/addresses', async (req, res) => {
req.setTimeout(timeout);
try {
let data = req.body
console.log(req.body.url)
while (browsers == maxNumberOfBrowsers) {
await sleep(1000)
}
await getAddressesHandler(data).then(result => {
let response = {
msg: 'retrieved addresses ',
hostname: os.hostname(),
addr: data.addr,
zuschlag: result
}
console.log('done')
res.send(response)
})
} catch (error) {
res.send({ error: error.toString() })
}
});
async function getAddressesHandler(arg) {
let pMng = require('./puppeteerManager')
let puppeteerMng = new pMng.PuppeteerManager(arg)
browsers += 1
try {
let addresses = await puppeteerMng.getAllAddresses().then(result => {
return result
})
browsers -= 1
return addresses
} catch (error) {
browsers -= 1
console.log(error)
}
}
function sleep(ms) {
console.log(' running maximum number of browsers')
return new Promise(resolve => setTimeout(resolve, ms))
}
app.listen(PORT);
console.log(`Running on port: ${PORT}`);
many thanks in advance
These answers are provided by our Community. If you find them useful, show some love by clicking the heart. If you run into issues leave a comment, or add your own answer to help others.
×