I am trying to scrape news articles from this website using the Apify Puppeteer crawler. I am trying to write a page function in JavaScript to achieve this, but I am unable to get it to work.
The webpage incorporates pagination, so I am trying to navigate through all the relevant pages to extract all articles with a publication date since '2024-08-15'.
I am new to javascript, so any help is highly appreciated. The page-function code I was trying to write is given below.
Can someone please help me to fix the page-function in javascript to use it in Apify Puppeteer scraper.
/**
 * Apify Puppeteer Scraper page function.
 *
 * Collects article links from the current news-list page that were published
 * on or after `startDate`, enqueues the next pagination page while articles
 * are still new enough, then visits each article to scrape title, body text,
 * publication date and a SHA-256 id.
 *
 * @param {object} context - Apify context ({ request, page, log, ... }).
 * @returns {Promise<object[]>} One result object per scraped article.
 */
async function pageFunction(context) {
    const { request, log, page } = context;
    const results = [];
    const url = request.url;
    // Only keep articles published on or after this date.
    const startDate = new Date('2024-08-15');

    // NOTE: no page.goto(url) here — the Puppeteer Scraper has already
    // navigated `page` to request.url before calling pageFunction.
    // The original if/else on customData.pageopt ran identical code in both
    // branches, so the branch (and the customData flag) has been removed.
    let liTags = [];
    try {
        await page.waitForSelector('ul.bwNewsList li');
        liTags = await page.$$('ul.bwNewsList li');
    } catch (error) {
        log.error(`News list not found on ${url}: ${error.message}`);
        return results; // nothing to scrape on this page
    }

    // Collect qualifying article URLs; remember the date of the last (oldest)
    // list item so we can decide whether to paginate further.
    const urls = [];
    let lastDate = null;
    for (const li of liTags) {
        lastDate = new Date(await li.$eval('time', (el) => el.getAttribute('datetime')));
        // $eval throws if no <a> exists; treat that as "no link".
        // (Original used aTag.getPropert('href') — a typo for getProperty.)
        const href = await li.$eval('a', (el) => el.href).catch(() => null);
        if (href && lastDate >= startDate) {
            urls.push(href);
        }
    }

    // If even the oldest article on this page is still within range, the next
    // page may contain more matches — enqueue it before navigating away.
    // (Original called context.enqueRequest — a typo for enqueueRequest.)
    if (lastDate && lastDate >= startDate) {
        try {
            const nextHref = await page.$eval('div.pagingNext a', (el) => el.href);
            log.info(`Enqueueing next page: ${nextHref}`);
            await context.enqueueRequest({ url: nextHref });
        } catch (error) {
            log.info('No next-page link found; pagination finished.');
        }
    }

    // Visit each article page and scrape its content.
    for (const articleUrl of urls) {
        // Original had " domcontentloaded" with a leading space — invalid.
        await page.goto(articleUrl, { waitUntil: 'domcontentloaded' });

        // Collapse whitespace runs. The original regex /[rns]+/g was missing
        // backslashes and stripped the letters r, n and s from the text.
        const title = await page.$eval('title', (el) =>
            el.textContent.replace(/\s+/g, ' ').trim());

        // Stable article id: SHA-256 of url + title, hex-encoded.
        const uniqueArticle = articleUrl + title;
        const titleHash = await page.evaluate(async (input) => {
            const encoder = new TextEncoder();
            const data = encoder.encode(input);
            const hash = await window.crypto.subtle.digest('SHA-256', data);
            return Array.from(new Uint8Array(hash))
                .map((b) => b.toString(16).padStart(2, '0'))
                .join('');
        }, uniqueArticle);

        const pgdate = new Date(
            await page.$eval('div.bw-release-timestamp time', (el) =>
                el.getAttribute('datetime')),
        );

        // Concatenate the text of all paragraphs (these are <p>, not <div>).
        const paragraphs = await page.$$('p');
        let text = '';
        for (const p of paragraphs) {
            text += await page.evaluate((el) => el.textContent, p);
            text += ' '; // keep paragraphs from running together
        }
        text = text.replace(/\s+/g, ' ').trim();

        results.push({
            article_id: titleHash,
            url: articleUrl,
            title,
            content: text,
            published_date: pgdate,
            publisher: 'BW',
        });
    }
    return results;
}
Thanks in advance.