I’m using Node.js and Puppeteer to scrape data from a website.
My goal is to run multiple instances of Puppeteer while scraping the data.
However, I may be doing something wrong, because only one instance is running and it is sending multiple entries at once.
Here is what my code does:
- Opens an XLSX file from the computer.
- Does some processing and then reads the file into memory (a list of numCpfCnpj and numAcordo).
- Splits the entries into batches of 5 for parallel processing.
- Calls Puppeteer to log in to the page and search for the numCpfCnpj.
- After searching for the numCpfCnpj, it should search for the numAcordo and continue with the logic.
However, I can’t figure out how to run more than one instance. Is it possible with Puppeteer?
Or should I just put the script on a cloud service (like AWS Lambda) and run multiple instances of it without worrying about anything else?
Here is the code that calls the methods:
const puppeteer = require('puppeteer');
const XLSX = require('xlsx');
/**
 * Logs in through the portal's external login form on the given page.
 *
 * Opens the login URL, waits for both credential inputs, types the
 * credentials, and submits the form by pressing Enter in the password field.
 *
 * @param {object} page - Puppeteer page used to drive the login form.
 * @param {string} login - Text typed into `#userLogin__input`.
 * @param {string} password - Text typed into `#userPassword__input`.
 * @returns {Promise<void>} Resolves once the navigation triggered by the
 *   form submission has completed.
 */
const handleLogin = async (page, login, password) => {
  await page.goto('https://www.santandernegocios.com.br/portaldenegocios/#/externo');
  await page.waitForSelector('#userLogin__input');
  await page.waitForSelector('#userPassword__input');
  await page.type('#userLogin__input', login);
  await page.type('#userPassword__input', password);
  await page.focus('#userPassword__input');
  // Start waiting for the navigation BEFORE pressing Enter. If the wait is
  // registered only after the key press, a fast navigation can finish first
  // and waitForNavigation() then hangs until it times out.
  await Promise.all([
    page.waitForNavigation(),
    page.keyboard.press('Enter'),
  ]);
};
/**
 * Types a CPF/CNPJ into the search box and submits the search.
 *
 * @param {object} page - Puppeteer page that already shows the search input.
 * @param {string} cpfCnpj - Value to search for; must already be a string.
 * @returns {Promise<void>} Resolves once the navigation triggered by the
 *   search has reached the `networkidle2` state.
 * @throws {TypeError} If `cpfCnpj` is not a string.
 */
const searchAndNavigate = async (page, cpfCnpj) => {
  // Validate before touching the page so bad input fails immediately instead
  // of after waiting for the selector.
  if (typeof cpfCnpj !== 'string') {
    throw new TypeError('cpfCnpj must be a string');
  }

  const searchInput = 'input[name="inputSearch"]';

  await page.waitForSelector(searchInput);
  await page.type(searchInput, cpfCnpj);

  // Wait until the input's value equals the full text that was typed before
  // submitting it.
  await page.waitForFunction(
    (selector, text) => {
      const input = document.querySelector(selector);
      return input && input.value === text;
    },
    {},
    searchInput,
    cpfCnpj
  );

  await page.focus(searchInput);
  // Register the navigation wait before pressing Enter; otherwise a fast
  // navigation can complete first and the wait only ends by timing out.
  await Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle2' }),
    page.keyboard.press('Enter'),
  ]);
};
/**
 * Clicks the first session-menu link whose trimmed text matches one of the
 * given labels.
 *
 * Labels are tried in the order given; the first label that has a matching
 * `li a` anchor inside the `ul` of `div.session-menu` wins. Nothing happens
 * when the menu, the list, or a matching link is missing.
 *
 * @param {object} page - Puppeteer page whose document contains the menu.
 * @param {string[]} menuTexts - Candidate link labels, in priority order.
 * @returns {Promise<void>}
 */
const clickMenuLink = async (page, menuTexts) => {
  await page.evaluate((labels) => {
    const menu = document.querySelector('div.session-menu');
    if (!menu) {
      return;
    }

    const list = menu.querySelector('ul');
    if (!list) {
      return;
    }

    for (const label of labels) {
      const anchors = Array.from(list.querySelectorAll('li a'));
      const match = anchors.find((anchor) => anchor.textContent.trim() === label);
      if (!match) {
        continue;
      }
      match.click();
      return;
    }
  }, menuTexts);
};
/**
 * Reads the first sheet of an XLSX file and returns its rows as batches of
 * `{ numCpfCnpj, numAcordo }` entries.
 *
 * The first row is treated as a header and skipped. Column 0 becomes
 * `numCpfCnpj` and column 1 becomes `numAcordo`, both as trimmed strings.
 * Rows without a CPF/CNPJ (blank rows) are dropped; a missing `numAcordo`
 * becomes `''` rather than the literal text "undefined".
 *
 * NOTE(review): a CPF/CNPJ stored as a numeric cell presumably loses its
 * leading zeros before it reaches this function — confirm the column is
 * formatted as text in the spreadsheet.
 *
 * @param {string} filePath - Path of the workbook to read.
 * @param {number} [groupSize=5] - Maximum number of entries per batch.
 * @returns {Promise<Array<Array<{numCpfCnpj: string, numAcordo: string}>>>}
 *   The entries, in sheet order, split into batches of at most `groupSize`.
 * @throws {RangeError} If `groupSize` is not a positive integer.
 */
const processXlsxFile = async (filePath, groupSize = 5) => {
  // A step of 0 (or a negative/fractional one) would make the batching loop
  // spin forever or slice incorrectly.
  if (!Number.isInteger(groupSize) || groupSize < 1) {
    throw new RangeError('groupSize must be a positive integer');
  }

  const workbook = XLSX.readFile(filePath);
  const sheet = workbook.Sheets[workbook.SheetNames[0]];
  const data = XLSX.utils.sheet_to_json(sheet, { header: 1 });

  // String(undefined) would yield "undefined", so map absent cells to ''.
  const toText = (cell) => (cell == null ? '' : String(cell).trim());

  const entries = data
    .slice(1) // Skip header row
    .map((row) => {
      const cells = row || [];
      return { numCpfCnpj: toText(cells[0]), numAcordo: toText(cells[1]) };
    })
    .filter((entry) => entry.numCpfCnpj !== '');

  const groupedEntries = [];
  for (let i = 0; i < entries.length; i += groupSize) {
    groupedEntries.push(entries.slice(i, i + groupSize));
  }
  return groupedEntries;
};
/**
 * Searches for each entry on the given page and opens the matching menu link,
 * one entry at a time.
 *
 * Every entry drives the SAME page (typing into its search box and
 * navigating), so entries must not overlap: running them concurrently
 * interleaves keystrokes and navigations from different entries. A failure
 * on one entry is logged and does not stop the remaining entries.
 *
 * @param {object} page - Puppeteer page shared by all entries.
 * @param {Array<{numCpfCnpj: string, numAcordo: string}>} entries - Entries
 *   to process, in order.
 * @param {string[]} menuTexts - Candidate menu labels passed to `clickMenuLink`.
 * @returns {Promise<void>} Resolves after every entry has been attempted.
 */
const processEntries = async (page, entries, menuTexts) => {
  // Sequential on purpose: a single page can only run one search at a time.
  for (const entry of entries) {
    try {
      console.log(entry.numCpfCnpj);
      await searchAndNavigate(page, entry.numCpfCnpj);
      await clickMenuLink(page, menuTexts);
      console.log(`Processed: ${entry.numCpfCnpj} - ${entry.numAcordo}`);
    } catch (error) {
      console.error(`Error processing ${entry.numCpfCnpj} - ${entry.numAcordo}:`, error);
    }
  }
};
/**
 * Launches a browser, logs in, and processes every entry of the XLSX file.
 *
 * One browser and one page are used for the whole run, and the batches
 * returned by `processXlsxFile` are handed to `processEntries` one after
 * another. Any error after the browser has launched is logged rather than
 * rethrown, and the browser is always closed.
 *
 * NOTE(review): with a single page the work cannot run in parallel. Scaling
 * out presumably needs one page (or one browser) per worker, each with its
 * own logged-in session — confirm how the site handles concurrent sessions
 * before adding that.
 *
 * @param {string} filePath - Path of the XLSX file listing the entries.
 * @param {string} login - Login passed to `handleLogin`.
 * @param {string} password - Password passed to `handleLogin`.
 * @param {string[]} menuTexts - Candidate menu labels passed to `processEntries`.
 * @returns {Promise<void>} Resolves once the browser has been closed.
 */
const runPuppeteer = async (filePath, login, password, menuTexts) => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    // Opened inside the try so a failure here still reaches browser.close().
    const page = await browser.newPage();
    await handleLogin(page, login, password);

    const groups = await processXlsxFile(filePath);
    // Each group is already a batch of bounded size, so no further chunking
    // is needed before handing it over.
    for (const group of groups) {
      await processEntries(page, group, menuTexts);
    }
  } catch (error) {
    console.error('Error occurred:', error);
  } finally {
    await browser.close();
  }
};
module.exports = { runPuppeteer };
Thanks a lot. If any additional information is needed, I’ll reply as soon as I see the notification.