I have run into an unusual issue with CSV generation from an array of data I create through web scraping with Puppeteer.
Through console logs in my code I can see that all the data is correctly obtained after the scrape. I then create a CSV file mapped in a particular way from the collected data.
For some reason, not all of the data is written to the CSV file when I compare it against my array.
What could be the issue? Code below (I have posted the scraping part and the code that generates the CSV file). I can post anything else relevant/required upon request.
Thanks in advance.
// Visit each product page in turn and extract its details in the browser
// context. A failed page (missing selector, navigation error) is recorded in
// failedProdData and the loop continues, instead of aborting the whole scrape.
for (const prodPage of prodLinks) {
  if (!prodPage) continue;
  try {
    // Plain await instead of the previous `.then(async () => …)` mix, which
    // obscures error propagation.
    await page.goto(prodPage);
    // Runs inside the browser; only serialisable values cross the boundary.
    const extractProdInfo = await page.evaluate((furnitureRange, rangeLink, prodPage) => {
      const handle = document.querySelector('.barcode').textContent;
      const title = document.querySelector('[data-ui-id="page-title-wrapper"]').textContent;
      const colour = document.getElementById('attr-other')
        .querySelector('tr:first-child')
        .querySelector('td:last-child').textContent;
      const desc = document.getElementById('js-product-description').textContent;

      // "Label: value" pairs from the dimensions table.
      const dimensionsArray = [];
      document.getElementById('attr-dimensions').querySelectorAll('tr').forEach(row => {
        const label = row.querySelector('td:first-child').textContent.trim();
        const value = row.querySelector('td:last-child').textContent.trim();
        dimensionsArray.push(`${label}: ${value}`);
      });

      // "Label: value" pairs from the other-details table, minus the RRP row.
      let detailsArray = [];
      document.getElementById('attr-other').querySelectorAll('tr').forEach(row => {
        const label = row.querySelector('td:first-child').textContent.trim();
        const value = row.querySelector('td:last-child').textContent.trim();
        detailsArray.push(`${label}: ${value}`);
      });
      detailsArray = detailsArray.filter(detail => !detail.startsWith('RRP'));

      // NOTE(review): the pasted source contained bare "n" here — evidently
      // lost "\n" escapes. Restored as real newlines alongside the <br> markup.
      const body = desc + '<br><br>' +
        '\n\n<b>Dimensions:</b>\n' + dimensionsArray.join('<br>') +
        '<br><br>' + '\n\n<b>Highlights:</b>\n' + detailsArray.join('<br>');

      // CHANGE ACCORDING TO SCRAPE TYPE
      const type = 'Tables';
      const categoryNo = '2158';
      // ------------------------------

      // Tags: every furniture-range slug found in the product URL, with
      // hyphens removed and each word capitalised ("oak-dining" -> "Oak Dining").
      // split('-') on a hyphen-less slug yields [slug], so one code path
      // covers both the hyphenated and single-word cases identically.
      const tagsArray = [];
      for (const range of furnitureRange) {
        if (!rangeLink.includes(range)) continue;
        const capitalizedTag = range
          .split('-')
          .map(word => word.charAt(0).toUpperCase() + word.slice(1))
          .join(' ');
        tagsArray.push(capitalizedTag);
      }
      tagsArray.push(type, colour);

      // Full-size gallery images only — skip thumbnails and cached variants.
      const imageURLs = [];
      document.querySelectorAll('.fotorama__img').forEach(img => {
        const src = img.getAttribute('src');
        if (!src.includes('thumbnail') && !src.includes('cache/c')) {
          imageURLs.push(src);
        }
      });

      // Price: prefer the original-price element, fall back to the pack price.
      const priceElement =
        document.getElementById('js-original-price') ||
        document.querySelector('.pack-price');
      let priceText = '';
      if (priceElement) {
        // Strip the currency symbol and any thousands separators before
        // parsing; previously only the first "£" was removed and a missing
        // element produced the literal string "NaN".
        const raw = parseFloat(priceElement.textContent.trim().replace(/[£,]/g, ''));
        if (!Number.isNaN(raw)) {
          // 45% markup, rounded up to the next whole pound.
          priceText = Math.ceil(raw * 1.45).toFixed(2);
        }
      }

      return {
        handle,
        title: `${title} - ${colour}`,
        body,
        vendor: 'gallery-direct',
        collection: type,
        productCategory: categoryNo,
        type,
        tags: tagsArray.join(','),
        imageSrc: imageURLs,
        imagePosition: '1',
        variantPrice: priceText,
        variantSKU: handle,
        prodURL: prodPage
      };
    }, furnitureRange, rangeLink, prodPage);
    allProductData.push(extractProdInfo);
  } catch (err) {
    // Record the page so the existing failed-products log picks it up.
    console.error(`Failed to scrape ${prodPage}: ${err.message}`);
    failedProdData.push(prodPage);
  }
}
//console.log(allProductData)
//Create Failed Products Data Log
/**
 * Append a dated batch of failed-scrape entries to a persistent log file.
 *
 * Fixes: the pasted source used bare 'n' where '\n' was intended, so every
 * batch ran together on one line. Also, fs.appendFileSync already creates the
 * file when it does not exist, so the existsSync/writeFileSync branch was
 * redundant and has been collapsed into a single call.
 *
 * @param {string} outputName - batch name used in the log heading
 * @param {string[]} errors   - one entry per failed product (e.g. URLs)
 */
function createFailedLog(outputName, errors) {
  const filePath = 'failed-scraping-products.txt';
  const dateFormatted = new Date().toISOString().split('T')[0];
  const batchHeading = `${outputName} Log - ${dateFormatted}\n`;
  const errorString = errors.join('\n');
  // Blank line between batches keeps successive runs readable.
  const dataToWrite = `${batchHeading}${errorString}\n\n`;
  // appendFileSync creates the file if missing, so one call covers both cases.
  fs.appendFileSync(filePath, dataToWrite);
}
// Summarise the scrape and persist any failures for later retry.
// ('n' in the pasted source restored to a real '\n' newline.)
console.log(`\nFailed Scrapes: ${failedProdData.length}`);
console.log(`allProductData: ${allProductData.length}`);
if (failedProdData.length > 0) {
  console.log('Writing Failed Scraping Products log ... ');
  createFailedLog(outputName, failedProdData);
}
/// Function to create CSV data from object data
/**
 * Flatten scraped product objects into rows for a Shopify-style CSV.
 *
 * Each product becomes one row per image; product-level columns are filled
 * only on the first row, while Handle and Variant SKU repeat on every row.
 *
 * BUG FIX: a product whose imageSrc array was empty previously produced ZERO
 * rows (the forEach body never ran) and was silently dropped from the CSV —
 * this is why the file contained fewer products than allProductData. Such
 * products now emit one row with a blank Image Src.
 *
 * @param {Array<Object>} data     scraped products (see extractProdInfo)
 * @param {Array<string>} headers  header row, pushed verbatim as row 0
 * @param {Object} defaults        fixed values for the Shopify default columns
 * @returns {Array<Array>} array-of-arrays ready for XLSX.utils.aoa_to_sheet
 */
const createCSVData = (data, headers, defaults) => {
  const csvData = [headers];
  data.forEach(item => {
    // Guarantee at least one row per product, even with no scraped images.
    const images = item.imageSrc.length > 0 ? item.imageSrc : [''];
    images.forEach((imgSrc, index) => {
      const first = index === 0; // product-level columns only on the first row
      const newRow = {
        'Handle': item.handle, // repeats on every image row
        'Title': first ? item.title : '',
        'Body (HTML)': first ? item.body : '',
        'Vendor': first ? item.vendor : '',
        'Collection': first ? item.collection : '',
        'Product Category': first ? item.productCategory : '',
        'Type': first ? item.type : '',
        'Tags': first ? item.tags : '',
        'Published': first ? defaults['Published'] : '',
        'Option1 Name': defaults['Option1 Name'],
        'Option1 Value': defaults['Option1 Value'],
        'Option2 Name': defaults['Option2 Name'],
        'Option2 Value': defaults['Option2 Value'],
        'Option3 Name': defaults['Option3 Name'],
        'Option3 Value': defaults['Option3 Value'],
        'Variant SKU': item.variantSKU, // repeats on every image row
        'Variant Grams': first ? defaults['Variant Grams'] : '',
        'Variant Inventory Qty': first ? defaults['Variant Inventory Qty'] : '',
        'Variant Inventory Policy': first ? defaults['Variant Inventory Policy'] : '',
        'Variant Fulfillment Service': first ? defaults['Variant Fulfillment Service'] : '',
        'Variant Price': first ? item.variantPrice : '',
        'Variant Compare At Price': first ? defaults['Variant Compare At Price'] : '',
        'Variant Requires Shipping': first ? defaults['Variant Requires Shipping'] : '',
        'Variant Taxable': first ? defaults['Variant Taxable'] : '',
        'Image Src': imgSrc,
        'Image Position': index + 1, // 1-based position per product
        'Gift Card': first ? defaults['Gift Card'] : '',
        'Variant Weight Unit': first ? defaults['Variant Weight Unit'] : '',
        'Status': first ? defaults['Status'] : ''
      };
      csvData.push(Object.values(newRow));
    });
  });
  return csvData;
};
// Create CSV data
// --- Generate the CSV workbook -------------------------------------------
const csvData = createCSVData(allProductData, headersArray, defaultValues);
const ws = XLSX.utils.aoa_to_sheet(csvData);
console.log('All product data collected. Now generating CSV file. This may take several minutes...');
const wb = XLSX.utils.book_new();
XLSX.utils.book_append_sheet(wb, ws, 'ScraperWebsite');
// Date-stamp the output so repeated runs do not clobber each other.
const dateFormatted = new Date().toISOString().split('T')[0];
const filename = `${outputName}_${dateFormatted}.csv`;
XLSX.writeFile(wb, filename);
console.log('CSV generated');

// --- Verify every scraped product made it into the file ------------------
// BUG FIX: the original split the file on the LETTER 'n' (a lost '\n'
// escape), which shredded the contents and made every comparison wrong.
// NOTE(review): this check is still naive — CSV fields may be quoted and
// contain commas or embedded newlines (the Body (HTML) column does), so a
// raw newline/comma split can misparse rows. It works here only because the
// handle is the first cell of EVERY image row and handles contain no commas;
// a proper CSV parser would be more robust.
const csvLines = fs.readFileSync(filename, 'utf8').split(/\r?\n/);
const csvHandles = new Set(
  csvLines
    .slice(1) // skip the header row
    .map(row => row.split(',')[0].replace(/^"|"$/g, '')) // first cell, unquoted
);
// Set.has is O(1), so this stays linear in the number of products.
const missingUrls = allProductData
  .filter(item => !csvHandles.has(item.handle))
  .map(item => item.prodURL);
console.log('Missing URLs in CSV:');
console.log(missingUrls);